From fd986379eef48d89afea07825808d92c35446ff8 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 29 Feb 2012 15:22:34 +0100 Subject: [PATCH] - Update Xen patches to 3.3-rc5 and c/s 1157. - config.conf: Re-enable Xen configs. - Update config files. suse-commit: f22562233702e32624577b9254075e086c8da001 --- .../ABI/testing/sysfs-kernel-mm-cleancache | 11 - Documentation/kernel-parameters.txt | 29 + Documentation/vm/cleancache.txt | 41 +- Documentation/vm/frontswap.txt | 210 + arch/ia64/Kconfig | 2 +- arch/ia64/Makefile | 2 +- arch/ia64/include/asm/xen/hypervisor.h | 4 +- arch/ia64/include/asm/xen/interface.h | 25 +- arch/ia64/kernel/asm-offsets.c | 2 +- arch/ia64/kernel/vmlinux.lds.S | 2 +- arch/ia64/xen/Kconfig | 8 +- arch/ia64/xen/xcom_hcall.c | 2 +- arch/x86/Kbuild | 2 +- arch/x86/Kconfig | 173 +- arch/x86/Kconfig.cpu | 15 +- arch/x86/Kconfig.debug | 6 +- arch/x86/Makefile | 22 +- arch/x86/boot/Makefile | 17 +- arch/x86/ia32/ia32entry-xen.S | 383 ++ arch/x86/include/asm/acpi.h | 33 + arch/x86/include/asm/agp.h | 3 + arch/x86/include/asm/apic.h | 26 + arch/x86/include/asm/apicdef.h | 16 + arch/x86/include/asm/boot.h | 2 +- arch/x86/include/asm/cpufeature.h | 4 + arch/x86/include/asm/debugreg.h | 2 +- arch/x86/include/asm/e820.h | 4 + arch/x86/include/asm/hardirq.h | 4 + arch/x86/include/asm/hw_irq.h | 10 + arch/x86/include/asm/hypervisor.h | 4 + arch/x86/include/asm/i8259.h | 2 + arch/x86/include/asm/io.h | 4 +- arch/x86/include/asm/kexec.h | 29 + arch/x86/include/asm/mach_traps.h | 25 + arch/x86/include/asm/mc146818rtc.h | 2 +- arch/x86/include/asm/mmu.h | 5 +- arch/x86/include/asm/nmi.h | 3 + arch/x86/include/asm/page_64_types.h | 8 + arch/x86/include/asm/ptrace.h | 14 +- arch/x86/include/asm/required-features.h | 2 +- arch/x86/include/asm/segment.h | 4 +- arch/x86/include/asm/thread_info.h | 8 + arch/x86/include/asm/topology.h | 2 +- arch/x86/include/asm/trampoline.h | 2 +- arch/x86/include/asm/traps.h | 6 + arch/x86/include/asm/uv/uv_hub.h | 2 +- arch/x86/include/asm/xen/hypercall.h | 1 + arch/x86/include/asm/xen/hypervisor.h | 2 +- arch/x86/include/asm/xen/interface.h | 26 +- arch/x86/include/mach-xen/asm/agp.h | 58 + arch/x86/include/mach-xen/asm/cmpxchg.h | 11 + arch/x86/include/mach-xen/asm/cmpxchg_32.h | 24 + arch/x86/include/mach-xen/asm/cmpxchg_64.h | 11 + arch/x86/include/mach-xen/asm/desc.h | 433 ++ arch/x86/include/mach-xen/asm/dma-mapping.h | 25 + arch/x86/include/mach-xen/asm/fixmap.h | 240 ++ arch/x86/include/mach-xen/asm/gnttab_dma.h | 41 + arch/x86/include/mach-xen/asm/highmem.h | 98 + arch/x86/include/mach-xen/asm/hypercall.h | 439 ++ arch/x86/include/mach-xen/asm/hypercall_32.h | 62 + arch/x86/include/mach-xen/asm/hypercall_64.h | 54 + arch/x86/include/mach-xen/asm/hypervisor.h | 392 ++ arch/x86/include/mach-xen/asm/i387.h | 55 + arch/x86/include/mach-xen/asm/io.h | 343 ++ arch/x86/include/mach-xen/asm/ipi.h | 13 + arch/x86/include/mach-xen/asm/irq_vectors.h | 98 + arch/x86/include/mach-xen/asm/irqflags.h | 212 + arch/x86/include/mach-xen/asm/mach_traps.h | 37 + arch/x86/include/mach-xen/asm/maddr.h | 155 + arch/x86/include/mach-xen/asm/maddr_32.h | 35 + arch/x86/include/mach-xen/asm/maddr_64.h | 21 + arch/x86/include/mach-xen/asm/mmu_context.h | 165 + arch/x86/include/mach-xen/asm/mutex.h | 3 + arch/x86/include/mach-xen/asm/pci.h | 180 + arch/x86/include/mach-xen/asm/percpu.h | 61 + arch/x86/include/mach-xen/asm/perf_event.h | 42 + arch/x86/include/mach-xen/asm/pgalloc.h | 159 + arch/x86/include/mach-xen/asm/pgtable-3level.h | 152 + 
.../include/mach-xen/asm/pgtable-3level_types.h | 44 + arch/x86/include/mach-xen/asm/pgtable.h | 885 +++++ arch/x86/include/mach-xen/asm/pgtable_32.h | 89 + arch/x86/include/mach-xen/asm/pgtable_64.h | 203 + arch/x86/include/mach-xen/asm/pgtable_64_types.h | 64 + arch/x86/include/mach-xen/asm/pgtable_types.h | 392 ++ arch/x86/include/mach-xen/asm/probe_roms.h | 10 + arch/x86/include/mach-xen/asm/processor.h | 978 +++++ arch/x86/include/mach-xen/asm/setup.h | 21 + arch/x86/include/mach-xen/asm/smp-processor-id.h | 36 + arch/x86/include/mach-xen/asm/smp.h | 241 ++ arch/x86/include/mach-xen/asm/spinlock.h | 379 ++ arch/x86/include/mach-xen/asm/spinlock_types.h | 62 + arch/x86/include/mach-xen/asm/swiotlb.h | 8 + arch/x86/include/mach-xen/asm/system.h | 520 +++ arch/x86/include/mach-xen/asm/time.h | 18 + arch/x86/include/mach-xen/asm/tlbflush.h | 114 + arch/x86/include/mach-xen/asm/vga.h | 20 + arch/x86/include/mach-xen/asm/xenoprof.h | 48 + arch/x86/include/mach-xen/asm/xor.h | 8 + arch/x86/include/mach-xen/asm/xor_64.h | 339 ++ arch/x86/kernel/Makefile | 7 + arch/x86/kernel/acpi/Makefile | 4 + arch/x86/kernel/acpi/boot.c | 25 +- arch/x86/kernel/acpi/processor_extcntl_xen.c | 287 ++ arch/x86/kernel/amd_nb.c | 6 + arch/x86/kernel/apic/Makefile | 4 + arch/x86/kernel/apic/apic-xen.c | 69 + arch/x86/kernel/apic/hw_nmi.c | 8 + arch/x86/kernel/apic/io_apic-xen.c | 4199 ++++++++++++++++++++ arch/x86/kernel/apic/ipi-xen.c | 43 + arch/x86/kernel/apic/probe_32-xen.c | 57 + arch/x86/kernel/asm-offsets.c | 4 +- arch/x86/kernel/asm-offsets_32.c | 14 +- arch/x86/kernel/asm-offsets_64.c | 2 + arch/x86/kernel/cpu/Makefile | 3 + arch/x86/kernel/cpu/amd.c | 18 +- arch/x86/kernel/cpu/bugs.c | 6 + arch/x86/kernel/cpu/bugs_64.c | 2 + arch/x86/kernel/cpu/common-xen.c | 1435 +++++++ arch/x86/kernel/cpu/intel.c | 17 + arch/x86/kernel/cpu/intel_cacheinfo.c | 17 +- arch/x86/kernel/cpu/mcheck/Makefile | 1 + arch/x86/kernel/cpu/mcheck/mce-inject.c | 4 +- arch/x86/kernel/cpu/mcheck/mce.c | 28 + arch/x86/kernel/cpu/mcheck/mce_dom0.c | 185 + arch/x86/kernel/cpu/mtrr/Makefile | 1 + arch/x86/kernel/cpu/mtrr/main-xen.c | 326 ++ arch/x86/kernel/cpu/proc.c | 8 +- arch/x86/kernel/cpu/scattered.c | 2 + arch/x86/kernel/cpu/topology.c | 2 +- arch/x86/kernel/dumpstack_64.c | 4 + arch/x86/kernel/e820-xen.c | 1291 ++++++ arch/x86/kernel/early_printk-xen.c | 291 ++ arch/x86/kernel/entry_32-xen.S | 1722 ++++++++ arch/x86/kernel/entry_32.S | 10 +- arch/x86/kernel/entry_64-xen.S | 1385 +++++++ arch/x86/kernel/entry_64.S | 6 +- arch/x86/kernel/fixup.c | 89 + arch/x86/kernel/head-xen.c | 223 ++ arch/x86/kernel/head32-xen.c | 103 + arch/x86/kernel/head64-xen.c | 146 + arch/x86/kernel/head_32-xen.S | 220 + arch/x86/kernel/head_64-xen.S | 176 + arch/x86/kernel/init_task.c | 3 +- arch/x86/kernel/ioport-xen.c | 84 + arch/x86/kernel/irq-xen.c | 350 ++ arch/x86/kernel/irq_work-xen.c | 21 + arch/x86/kernel/ldt-xen.c | 272 ++ arch/x86/kernel/machine_kexec_32.c | 107 +- arch/x86/kernel/machine_kexec_64.c | 178 +- arch/x86/kernel/machine_kexec_xen.c | 29 + arch/x86/kernel/microcode_core-xen.c | 299 ++ arch/x86/kernel/mmconf-fam10h_64.c | 8 + arch/x86/kernel/mpparse-xen.c | 962 +++++ arch/x86/kernel/msr-xen.c | 337 ++ arch/x86/kernel/nmi.c | 17 +- arch/x86/kernel/pci-dma-xen.c | 366 ++ arch/x86/kernel/pci-nommu-xen.c | 114 + arch/x86/kernel/pcspeaker.c | 5 + arch/x86/kernel/probe_roms.c | 7 +- arch/x86/kernel/process-xen.c | 630 +++ arch/x86/kernel/process_32-xen.c | 446 +++ arch/x86/kernel/process_64-xen.c | 695 ++++ arch/x86/kernel/quirks.c | 17 +- 
arch/x86/kernel/relocate_kernel_32.S | 39 +- arch/x86/kernel/relocate_kernel_64.S | 36 +- arch/x86/kernel/resource.c | 8 + arch/x86/kernel/rtc.c | 9 + arch/x86/kernel/setup-xen.c | 1477 +++++++ arch/x86/kernel/setup_percpu.c | 4 + arch/x86/kernel/smp-xen.c | 237 ++ arch/x86/kernel/syscall_32-xen.c | 20 + arch/x86/kernel/time-xen.c | 626 +++ arch/x86/kernel/traps-xen.c | 747 ++++ arch/x86/kernel/vm86_32.c | 12 + arch/x86/kernel/vmlinux.lds.S | 10 +- arch/x86/kernel/vsyscall_64-xen.c | 363 ++ arch/x86/kernel/x8664_ksyms_64.c | 2 +- arch/x86/kernel/x86_init-xen.c | 107 + arch/x86/kvm/Kconfig | 1 + arch/x86/lib/Makefile | 3 + arch/x86/lib/cache-smp-xen.c | 27 + arch/x86/lib/memset_64-xen.S | 154 + arch/x86/lib/scrub.c | 21 + arch/x86/mm/Makefile | 3 + arch/x86/mm/dump_pagetables-xen.c | 392 ++ arch/x86/mm/fault-xen.c | 1241 ++++++ arch/x86/mm/highmem_32-xen.c | 196 + arch/x86/mm/hypervisor.c | 1314 ++++++ arch/x86/mm/init-xen.c | 503 +++ arch/x86/mm/init_32-xen.c | 1019 +++++ arch/x86/mm/init_64-xen.c | 1379 +++++++ arch/x86/mm/iomap_32-xen.c | 121 + arch/x86/mm/ioremap-xen.c | 827 ++++ arch/x86/mm/pageattr-xen.c | 1545 +++++++ arch/x86/mm/pat-xen.c | 840 ++++ arch/x86/mm/pat_internal.h | 4 + arch/x86/mm/pgtable-xen.c | 970 +++++ arch/x86/mm/pgtable_32-xen.c | 179 + arch/x86/mm/physaddr.c | 4 + arch/x86/oprofile/Makefile | 7 + arch/x86/oprofile/xenoprof.c | 179 + arch/x86/pci/Makefile | 3 + arch/x86/pci/amd_bus.c | 15 + arch/x86/pci/i386.c | 2 + arch/x86/pci/irq.c | 9 +- arch/x86/pci/mmconfig-shared.c | 29 + arch/x86/pci/pcifront.c | 59 + arch/x86/platform/efi/Makefile | 1 + arch/x86/platform/efi/efi-xen.c | 507 +++ arch/x86/platform/sfi/sfi.c | 7 + arch/x86/power/Makefile | 2 + arch/x86/vdso/Makefile | 1 + arch/x86/vdso/vclock_gettime.c | 13 +- arch/x86/vdso/vdso32-setup-xen.c | 491 +++ arch/x86/vdso/vdso32.S | 2 +- arch/x86/vdso/vdso32/note.S | 6 +- arch/x86/vdso/vdso32/syscall.S | 2 + arch/x86/xen/Kconfig | 19 +- arch/x86/xen/enlighten.c | 8 +- arch/x86/xen/xen-head.S | 4 +- drivers/Makefile | 3 +- drivers/acpi/Kconfig | 16 +- drivers/acpi/Makefile | 1 + drivers/acpi/acpi_memhotplug.c | 22 + drivers/acpi/acpica/hwsleep.c | 16 + drivers/acpi/osl.c | 6 +- drivers/acpi/pci_irq.c | 77 + drivers/acpi/pci_root.c | 70 + drivers/acpi/processor_core.c | 51 +- drivers/acpi/processor_driver.c | 167 +- drivers/acpi/processor_extcntl.c | 214 + drivers/acpi/processor_idle.c | 42 +- drivers/acpi/processor_perflib.c | 31 +- drivers/acpi/scan.c | 25 + drivers/acpi/sleep.c | 2 + drivers/base/cpu.c | 4 +- drivers/block/Kconfig | 10 +- drivers/block/Makefile | 4 +- drivers/block/floppy.c | 2 + drivers/block/xen-blkback/Makefile | 2 +- drivers/cdrom/Makefile | 1 + drivers/char/Kconfig | 2 +- drivers/char/agp/agp.h | 4 + drivers/char/agp/amd-k7-agp.c | 4 +- drivers/char/agp/amd64-agp.c | 4 +- drivers/char/agp/ati-agp.c | 4 +- drivers/char/agp/efficeon-agp.c | 2 +- drivers/char/agp/generic.c | 8 +- drivers/char/agp/intel-gtt.c | 18 + drivers/char/agp/sworks-agp.c | 6 +- drivers/char/mem.c | 16 + drivers/char/tpm/Kconfig | 9 + drivers/char/tpm/Makefile | 2 + drivers/char/tpm/tpm.h | 15 + drivers/char/tpm/tpm_vtpm.c | 543 +++ drivers/char/tpm/tpm_vtpm.h | 55 + drivers/char/tpm/tpm_xen.c | 720 ++++ drivers/cpufreq/Kconfig | 1 + drivers/cpuidle/Kconfig | 1 + drivers/dma/Kconfig | 2 +- drivers/dma/ioat/Makefile | 3 +- drivers/dma/ioat/dca.c | 12 + drivers/dma/ioat/dma.h | 17 + drivers/dma/ioat/dma_v2.h | 6 + drivers/dma/ioat/hw.h | 4 + drivers/dma/ioat/pci.c | 7 +- drivers/edac/Kconfig | 7 +- drivers/edac/edac_mc.c | 
4 + drivers/edac/i7core_edac.c | 2 +- drivers/edac/sb_edac.c | 4 + drivers/firmware/Kconfig | 1 + drivers/firmware/dcdbas.c | 28 +- drivers/firmware/dell_rbu.c | 45 +- drivers/firmware/dmi_scan.c | 5 + drivers/gpu/drm/i915/i915_drv.c | 2 +- drivers/gpu/drm/i915/i915_drv.h | 5 + drivers/gpu/drm/i915/i915_gem.c | 11 + drivers/gpu/drm/i915/intel_display.c | 4 + drivers/gpu/drm/radeon/radeon_device.c | 12 + drivers/gpu/drm/ttm/ttm_bo.c | 8 + drivers/gpu/drm/ttm/ttm_bo_vm.c | 6 + drivers/gpu/drm/ttm/ttm_page_alloc.c | 29 + drivers/gpu/drm/vmwgfx/Kconfig | 2 +- drivers/hv/Kconfig | 2 +- drivers/hwmon/Kconfig | 6 +- drivers/hwmon/coretemp-xen.c | 888 +++++ drivers/hwmon/via-cputemp-xen.c | 397 ++ drivers/ide/ide-lib.c | 11 + drivers/idle/Kconfig | 2 +- drivers/iommu/Kconfig | 1 + drivers/misc/Kconfig | 2 +- drivers/net/Kconfig | 14 +- drivers/net/Makefile | 4 +- drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c | 10 + drivers/net/ethernet/chelsio/cxgb3/sge.c | 62 +- drivers/net/ethernet/chelsio/cxgb3/version.h | 4 + drivers/net/xen-netback/Makefile | 2 +- drivers/oprofile/buffer_sync.c | 81 +- drivers/oprofile/cpu_buffer.c | 73 +- drivers/oprofile/cpu_buffer.h | 12 +- drivers/oprofile/event_buffer.h | 3 + drivers/oprofile/oprof.c | 32 + drivers/oprofile/oprof.h | 3 + drivers/oprofile/oprofile_files.c | 145 + drivers/pci/Kconfig | 49 +- drivers/pci/Makefile | 7 +- drivers/pci/guestdev.c | 881 ++++ drivers/pci/iomulti.c | 904 +++++ drivers/pci/iomulti.h | 122 + drivers/pci/msi-xen.c | 838 ++++ drivers/pci/pci-iomul.c | 440 ++ drivers/pci/pci.c | 12 + drivers/pci/pci.h | 22 + drivers/pci/probe.c | 16 +- drivers/pci/reserve.c | 137 + drivers/pci/setup-bus.c | 8 +- drivers/rtc/Kconfig | 2 +- drivers/scsi/Kconfig | 2 +- drivers/scsi/arcmsr/arcmsr.h | 2 +- drivers/sfi/sfi_core.c | 5 + drivers/staging/zcache/zcache-main.c | 10 +- drivers/tty/hvc/Kconfig | 2 +- drivers/tty/serial/8250/Kconfig | 1 + drivers/tty/tty_io.c | 9 +- drivers/video/Kconfig | 4 +- drivers/virtio/Kconfig | 1 + drivers/watchdog/Kconfig | 2 +- drivers/watchdog/xen_wdt.c | 32 +- drivers/xen/Kconfig | 506 ++- drivers/xen/Makefile | 57 +- drivers/xen/balloon/Makefile | 2 + drivers/xen/balloon/balloon.c | 804 ++++ drivers/xen/balloon/common.h | 57 + drivers/xen/balloon/sysfs.c | 209 + drivers/xen/blkback/Makefile | 4 + drivers/xen/blkback/blkback-pagemap.c | 97 + drivers/xen/blkback/blkback-pagemap.h | 37 + drivers/xen/blkback/blkback.c | 760 ++++ drivers/xen/blkback/cdrom.c | 154 + drivers/xen/blkback/common.h | 161 + drivers/xen/blkback/interface.c | 146 + drivers/xen/blkback/vbd.c | 212 + drivers/xen/blkback/xenbus.c | 662 +++ drivers/xen/blkfront/Makefile | 5 + drivers/xen/blkfront/blkfront.c | 1150 ++++++ drivers/xen/blkfront/block.h | 174 + drivers/xen/blkfront/vbd.c | 554 +++ drivers/xen/blkfront/vcd.c | 496 +++ drivers/xen/blktap/Makefile | 5 + drivers/xen/blktap/blktap.c | 1784 +++++++++ drivers/xen/blktap/blocktap.c | 1 + drivers/xen/blktap/common.h | 112 + drivers/xen/blktap/interface.c | 140 + drivers/xen/blktap/xenbus.c | 517 +++ drivers/xen/blktap2-new/Makefile | 4 + drivers/xen/blktap2-new/blktap.h | 218 + drivers/xen/blktap2-new/control.c | 316 ++ drivers/xen/blktap2-new/device.c | 572 +++ drivers/xen/blktap2-new/request.c | 418 ++ drivers/xen/blktap2-new/ring.c | 547 +++ drivers/xen/blktap2-new/sysfs.c | 299 ++ drivers/xen/blktap2/Makefile | 4 + drivers/xen/blktap2/blktap.h | 264 ++ drivers/xen/blktap2/control.c | 285 ++ drivers/xen/blktap2/device.c | 1176 ++++++ drivers/xen/blktap2/request.c | 296 ++ 
drivers/xen/blktap2/ring.c | 610 +++ drivers/xen/blktap2/sysfs.c | 475 +++ drivers/xen/blktap2/wait_queue.c | 40 + drivers/xen/char/Makefile | 1 + drivers/xen/char/mem.c | 222 ++ drivers/xen/console/Makefile | 2 + drivers/xen/console/console.c | 755 ++++ drivers/xen/console/xencons_ring.c | 141 + drivers/xen/core/Makefile | 16 + drivers/xen/core/acpi_memhotplug.c | 190 + drivers/xen/core/clockevents.c | 293 ++ drivers/xen/core/cpu_hotplug.c | 182 + drivers/xen/core/domctl.c | 562 +++ drivers/xen/core/domctl.h | 4 + drivers/xen/core/evtchn.c | 1992 ++++++++++ drivers/xen/core/firmware.c | 75 + drivers/xen/core/gnttab.c | 890 +++++ drivers/xen/core/machine_kexec.c | 403 ++ drivers/xen/core/machine_reboot.c | 285 ++ drivers/xen/core/pcpu.c | 447 +++ drivers/xen/core/reboot.c | 348 ++ drivers/xen/core/smpboot.c | 398 ++ drivers/xen/core/spinlock.c | 408 ++ drivers/xen/core/xen_proc.c | 30 + drivers/xen/evtchn.c | 22 +- drivers/xen/fbfront/Makefile | 2 + drivers/xen/fbfront/xenfb.c | 910 +++++ drivers/xen/fbfront/xenkbd.c | 366 ++ drivers/xen/features.c | 9 +- drivers/xen/gntdev/Makefile | 1 + drivers/xen/gntdev/gntdev.c | 1012 +++++ drivers/xen/netback/Makefile | 5 + drivers/xen/netback/accel.c | 269 ++ drivers/xen/netback/common.h | 297 ++ drivers/xen/netback/interface.c | 363 ++ drivers/xen/netback/loopback.c | 278 ++ drivers/xen/netback/netback.c | 1885 +++++++++ drivers/xen/netback/xenbus.c | 494 +++ drivers/xen/netfront/Makefile | 4 + drivers/xen/netfront/accel.c | 830 ++++ drivers/xen/netfront/netfront.c | 2274 +++++++++++ drivers/xen/netfront/netfront.h | 288 ++ drivers/xen/pci.c | 11 + drivers/xen/pcifront/Makefile | 5 + drivers/xen/pcifront/pci.c | 44 + drivers/xen/pcifront/pci_op.c | 671 ++++ drivers/xen/pcifront/pcifront.h | 57 + drivers/xen/pcifront/xenbus.c | 476 +++ drivers/xen/privcmd/Makefile | 3 + drivers/xen/privcmd/compat_privcmd.c | 140 + drivers/xen/privcmd/privcmd.c | 470 +++ drivers/xen/scsiback/Makefile | 4 + drivers/xen/scsiback/common.h | 173 + drivers/xen/scsiback/emulate.c | 480 +++ drivers/xen/scsiback/interface.c | 141 + drivers/xen/scsiback/scsiback.c | 730 ++++ drivers/xen/scsiback/translate.c | 168 + drivers/xen/scsiback/xenbus.c | 375 ++ drivers/xen/scsifront/Makefile | 3 + drivers/xen/scsifront/common.h | 135 + drivers/xen/scsifront/scsifront.c | 478 +++ drivers/xen/scsifront/xenbus.c | 424 ++ drivers/xen/sfc_netback/Makefile | 12 + drivers/xen/sfc_netback/accel.c | 147 + drivers/xen/sfc_netback/accel.h | 392 ++ drivers/xen/sfc_netback/accel_debugfs.c | 148 + drivers/xen/sfc_netback/accel_fwd.c | 420 ++ drivers/xen/sfc_netback/accel_msg.c | 391 ++ drivers/xen/sfc_netback/accel_solarflare.c | 1292 ++++++ drivers/xen/sfc_netback/accel_solarflare.h | 88 + drivers/xen/sfc_netback/accel_xenbus.c | 831 ++++ drivers/xen/sfc_netback/ci/compat.h | 53 + drivers/xen/sfc_netback/ci/compat/gcc.h | 158 + drivers/xen/sfc_netback/ci/compat/gcc_x86.h | 115 + drivers/xen/sfc_netback/ci/compat/primitive.h | 77 + drivers/xen/sfc_netback/ci/compat/sysdep.h | 166 + drivers/xen/sfc_netback/ci/compat/utils.h | 269 ++ drivers/xen/sfc_netback/ci/compat/x86.h | 48 + drivers/xen/sfc_netback/ci/compat/x86_64.h | 54 + drivers/xen/sfc_netback/ci/tools/config.h | 49 + drivers/xen/sfc_netback/ci/tools/debug.h | 336 ++ drivers/xen/sfc_netback/ci/tools/log.h | 269 ++ .../xen/sfc_netback/ci/tools/platform/gcc_x86.h | 370 ++ .../sfc_netback/ci/tools/platform/linux_kernel.h | 361 ++ drivers/xen/sfc_netback/ci/tools/sysdep.h | 132 + drivers/xen/sfc_netfront/Makefile | 11 + 
drivers/xen/sfc_netfront/accel.h | 495 +++ drivers/xen/sfc_netfront/accel_bufs.c | 393 ++ drivers/xen/sfc_netfront/accel_bufs.h | 181 + drivers/xen/sfc_netfront/accel_debugfs.c | 227 ++ drivers/xen/sfc_netfront/accel_msg.c | 567 +++ drivers/xen/sfc_netfront/accel_netfront.c | 330 ++ drivers/xen/sfc_netfront/accel_ssr.c | 308 ++ drivers/xen/sfc_netfront/accel_ssr.h | 88 + drivers/xen/sfc_netfront/accel_tso.c | 509 +++ drivers/xen/sfc_netfront/accel_tso.h | 57 + drivers/xen/sfc_netfront/accel_vi.c | 1203 ++++++ drivers/xen/sfc_netfront/accel_xenbus.c | 775 ++++ drivers/xen/sfc_netfront/ef_vi_falcon.h | 172 + drivers/xen/sfc_netfront/ef_vi_falcon_core.h | 1075 +++++ drivers/xen/sfc_netfront/ef_vi_falcon_desc.h | 43 + drivers/xen/sfc_netfront/ef_vi_falcon_event.h | 123 + drivers/xen/sfc_netfront/ef_vi_internal.h | 256 ++ drivers/xen/sfc_netfront/etherfabric/ef_vi.h | 647 +++ drivers/xen/sfc_netfront/falcon_event.c | 346 ++ drivers/xen/sfc_netfront/falcon_vi.c | 473 +++ drivers/xen/sfc_netfront/pt_tx.c | 91 + drivers/xen/sfc_netfront/sysdep.h | 185 + drivers/xen/sfc_netfront/vi_init.c | 183 + drivers/xen/sfc_netutil/Makefile | 11 + drivers/xen/sfc_netutil/accel_cuckoo_hash.c | 649 +++ drivers/xen/sfc_netutil/accel_cuckoo_hash.h | 227 ++ drivers/xen/sfc_netutil/accel_msg_iface.c | 301 ++ drivers/xen/sfc_netutil/accel_msg_iface.h | 415 ++ drivers/xen/sfc_netutil/accel_shared_fifo.h | 127 + drivers/xen/sfc_netutil/accel_util.c | 336 ++ drivers/xen/sfc_netutil/accel_util.h | 124 + drivers/xen/sys-hypervisor.c | 51 +- drivers/xen/tmem.c | 45 +- drivers/xen/tpmback/Makefile | 4 + drivers/xen/tpmback/common.h | 93 + drivers/xen/tpmback/interface.c | 133 + drivers/xen/tpmback/tpmback.c | 947 +++++ drivers/xen/tpmback/xenbus.c | 268 ++ drivers/xen/usbback/Makefile | 4 + drivers/xen/usbback/interface.c | 190 + drivers/xen/usbback/usbback.c | 1198 ++++++ drivers/xen/usbback/usbback.h | 170 + drivers/xen/usbback/usbstub.c | 324 ++ drivers/xen/usbback/xenbus.c | 334 ++ drivers/xen/usbfront/Makefile | 11 + drivers/xen/usbfront/usbfront-dbg.c | 101 + drivers/xen/usbfront/usbfront-hcd.c | 232 ++ drivers/xen/usbfront/usbfront-hub.c | 471 +++ drivers/xen/usbfront/usbfront-q.c | 542 +++ drivers/xen/usbfront/usbfront.h | 197 + drivers/xen/usbfront/xenbus.c | 414 ++ drivers/xen/util.c | 74 + drivers/xen/xen-pciback/Makefile | 16 +- drivers/xen/xen-pciback/conf_space_capability.c | 15 + drivers/xen/xen-pciback/conf_space_header.c | 9 +- drivers/xen/xen-pciback/controller.c | 450 +++ drivers/xen/xen-pciback/pci_stub.c | 53 + drivers/xen/xen-pciback/pciback.h | 21 +- drivers/xen/xen-pciback/pciback_ops.c | 52 + drivers/xen/xen-pciback/slot.c | 200 + drivers/xen/xen-pciback/xenbus.c | 122 +- drivers/xen/xen-selfballoon.c | 5 +- drivers/xen/xenbus/Makefile | 23 +- drivers/xen/xenbus/xenbus_backend_client.c | 106 + drivers/xen/xenbus/xenbus_client.c | 113 +- drivers/xen/xenbus/xenbus_comms.c | 71 +- drivers/xen/xenbus/xenbus_comms.h | 23 + drivers/xen/xenbus/xenbus_dev.c | 511 +++ drivers/xen/xenbus/xenbus_dev_backend.c | 4 +- drivers/xen/xenbus/xenbus_probe.c | 951 ++++- drivers/xen/xenbus/xenbus_probe.h | 30 + drivers/xen/xenbus/xenbus_probe_backend.c | 82 + drivers/xen/xenbus/xenbus_xs.c | 146 +- drivers/xen/xenoprof/xenoprofile.c | 585 +++ fs/Kconfig | 1 + fs/aio.c | 123 +- fs/block_dev.c | 2 +- fs/compat_ioctl.c | 23 + fs/proc/kcore.c | 6 +- fs/super.c | 2 +- include/acpi/processor.h | 148 + include/linux/acpi.h | 2 + include/linux/aio.h | 6 + include/linux/cleancache.h | 24 +- include/linux/console.h | 1 + 
include/linux/cpufreq.h | 2 +- include/linux/efi.h | 4 + include/linux/elfnote.h | 2 +- include/linux/frontswap.h | 126 + include/linux/highmem.h | 6 + include/linux/interrupt.h | 5 + include/linux/kexec.h | 20 + include/linux/mm.h | 22 + include/linux/nmi.h | 3 + include/linux/oprofile.h | 20 +- include/linux/page-flags.h | 46 +- include/linux/pci.h | 15 +- include/linux/swap.h | 4 + include/linux/swapfile.h | 13 + include/linux/sysctl.h | 1 + include/linux/vermagic.h | 7 +- include/xen/Kbuild | 3 +- include/xen/balloon.h | 67 +- include/xen/blkif.h | 159 + include/xen/clock.h | 18 + include/xen/compat_ioctl.h | 75 + include/xen/cpu_hotplug.h | 39 + include/xen/driver_util.h | 14 + include/xen/evtchn.h | 220 +- include/xen/features.h | 3 +- include/xen/firmware.h | 14 + include/xen/gntdev.h | 153 +- include/xen/gnttab.h | 207 + include/xen/hvm.h | 5 +- include/xen/hypercall.h | 30 + include/xen/interface/COPYING | 38 + include/xen/interface/arch-x86/cpuid.h | 68 + include/xen/interface/arch-x86/hvm/save.h | 583 +++ include/xen/interface/arch-x86/xen-mca.h | 440 ++ include/xen/interface/arch-x86/xen-x86_32.h | 171 + include/xen/interface/arch-x86/xen-x86_64.h | 202 + include/xen/interface/arch-x86/xen.h | 210 + include/xen/interface/arch-x86_32.h | 27 + include/xen/interface/arch-x86_64.h | 27 + include/xen/interface/callback.h | 9 + include/xen/interface/dom0_ops.h | 120 + include/xen/interface/domctl.h | 1013 +++++ include/xen/interface/elfnote.h | 118 +- include/xen/interface/event_channel.h | 74 +- include/xen/interface/features.h | 32 +- include/xen/interface/grant_table.h | 110 +- include/xen/interface/hvm/e820.h | 34 + include/xen/interface/hvm/hvm_info_table.h | 72 + include/xen/interface/hvm/hvm_op.h | 227 +- include/xen/interface/hvm/ioreq.h | 140 + include/xen/interface/hvm/params.h | 56 +- include/xen/interface/hvm/save.h | 111 + include/xen/interface/io/blkif.h | 138 +- include/xen/interface/io/cdromif.h | 120 + include/xen/interface/io/console.h | 18 + include/xen/interface/io/fbif.h | 26 +- include/xen/interface/io/fsif.h | 192 + include/xen/interface/io/libxenvchan.h | 97 + include/xen/interface/io/netif.h | 71 +- include/xen/interface/io/protocols.h | 22 + include/xen/interface/io/ring.h | 94 +- include/xen/interface/io/tpmif.h | 77 + include/xen/interface/io/usbif.h | 151 + include/xen/interface/io/vscsiif.h | 105 + include/xen/interface/io/xenbus.h | 1 + include/xen/interface/io/xs_wire.h | 35 +- include/xen/interface/kexec.h | 168 + include/xen/interface/mem_event.h | 90 + include/xen/interface/memory.h | 160 +- include/xen/interface/nmi.h | 80 + include/xen/interface/physdev.h | 93 +- include/xen/interface/platform.h | 268 +- include/xen/interface/sched.h | 32 +- include/xen/interface/sysctl.h | 640 +++ include/xen/interface/tmem.h | 148 + include/xen/interface/trace.h | 245 ++ include/xen/interface/vcpu.h | 62 +- include/xen/interface/version.h | 44 +- include/xen/interface/xen-compat.h | 44 + include/xen/interface/xen.h | 537 ++- include/xen/interface/xenoprof.h | 152 + include/xen/interface/xsm/acm.h | 223 ++ include/xen/interface/xsm/acm_ops.h | 159 + include/xen/interface/xsm/flask_op.h | 184 + include/xen/net-util.h | 75 + include/xen/pcifront.h | 69 + include/xen/pcpu.h | 19 + include/xen/privcmd.h | 80 +- include/xen/public/Kbuild | 5 + include/xen/public/evtchn.h | 88 + include/xen/public/gntdev.h | 150 + include/xen/public/iomulti.h | 50 + include/xen/public/privcmd.h | 86 + include/xen/public/xenbus.h | 52 + include/xen/sysctl.h | 11 + include/xen/xen.h | 6 
+- include/xen/xen_proc.h | 12 + include/xen/xenbus.h | 144 +- include/xen/xencons.h | 17 + include/xen/xenoprof.h | 42 + kernel/Kconfig.preempt | 1 + kernel/irq/spurious.c | 2 +- kernel/kexec.c | 96 +- kernel/ksysfs.c | 4 + kernel/power/Kconfig | 4 +- kernel/sched/core.c | 62 +- kernel/sysctl.c | 2 +- kernel/sysctl_binary.c | 12 + kernel/time/timekeeping.c | 6 + lib/swiotlb-xen.c | 808 ++++ mm/Kconfig | 19 +- mm/Makefile | 1 + mm/cleancache.c | 98 +- mm/filemap.c | 2 +- mm/frontswap.c | 272 ++ mm/init-mm.c | 4 + mm/memory.c | 46 +- mm/mmap.c | 14 + mm/page_alloc.c | 29 + mm/page_io.c | 12 + mm/swapfile.c | 64 +- mm/tmem-xen.c | 56 + mm/truncate.c | 10 +- mm/vmalloc.c | 30 + net/ipv6/addrconf.c | 2 + scripts/Makefile.build | 15 + scripts/Makefile.lib | 6 + scripts/Makefile.xen.awk | 34 + 665 files changed, 124049 insertions(+), 1515 deletions(-) delete mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-cleancache create mode 100644 Documentation/vm/frontswap.txt create mode 100644 arch/x86/ia32/ia32entry-xen.S create mode 100644 arch/x86/include/mach-xen/asm/agp.h create mode 100644 arch/x86/include/mach-xen/asm/cmpxchg.h create mode 100644 arch/x86/include/mach-xen/asm/cmpxchg_32.h create mode 100644 arch/x86/include/mach-xen/asm/cmpxchg_64.h create mode 100644 arch/x86/include/mach-xen/asm/desc.h create mode 100644 arch/x86/include/mach-xen/asm/dma-mapping.h create mode 100644 arch/x86/include/mach-xen/asm/fixmap.h create mode 100644 arch/x86/include/mach-xen/asm/gnttab_dma.h create mode 100644 arch/x86/include/mach-xen/asm/highmem.h create mode 100644 arch/x86/include/mach-xen/asm/hypercall.h create mode 100644 arch/x86/include/mach-xen/asm/hypercall_32.h create mode 100644 arch/x86/include/mach-xen/asm/hypercall_64.h create mode 100644 arch/x86/include/mach-xen/asm/hypervisor.h create mode 100644 arch/x86/include/mach-xen/asm/i387.h create mode 100644 arch/x86/include/mach-xen/asm/io.h create mode 100644 arch/x86/include/mach-xen/asm/ipi.h create mode 100644 arch/x86/include/mach-xen/asm/irq_vectors.h create mode 100644 arch/x86/include/mach-xen/asm/irqflags.h create mode 100644 arch/x86/include/mach-xen/asm/mach_traps.h create mode 100644 arch/x86/include/mach-xen/asm/maddr.h create mode 100644 arch/x86/include/mach-xen/asm/maddr_32.h create mode 100644 arch/x86/include/mach-xen/asm/maddr_64.h create mode 100644 arch/x86/include/mach-xen/asm/mmu_context.h create mode 100644 arch/x86/include/mach-xen/asm/mutex.h create mode 100644 arch/x86/include/mach-xen/asm/pci.h create mode 100644 arch/x86/include/mach-xen/asm/percpu.h create mode 100644 arch/x86/include/mach-xen/asm/perf_event.h create mode 100644 arch/x86/include/mach-xen/asm/pgalloc.h create mode 100644 arch/x86/include/mach-xen/asm/pgtable-3level.h create mode 100644 arch/x86/include/mach-xen/asm/pgtable-3level_types.h create mode 100644 arch/x86/include/mach-xen/asm/pgtable.h create mode 100644 arch/x86/include/mach-xen/asm/pgtable_32.h create mode 100644 arch/x86/include/mach-xen/asm/pgtable_64.h create mode 100644 arch/x86/include/mach-xen/asm/pgtable_64_types.h create mode 100644 arch/x86/include/mach-xen/asm/pgtable_types.h create mode 100644 arch/x86/include/mach-xen/asm/probe_roms.h create mode 100644 arch/x86/include/mach-xen/asm/processor.h create mode 100644 arch/x86/include/mach-xen/asm/setup.h create mode 100644 arch/x86/include/mach-xen/asm/smp-processor-id.h create mode 100644 arch/x86/include/mach-xen/asm/smp.h create mode 100644 arch/x86/include/mach-xen/asm/spinlock.h create mode 100644 
arch/x86/include/mach-xen/asm/spinlock_types.h create mode 100644 arch/x86/include/mach-xen/asm/swiotlb.h create mode 100644 arch/x86/include/mach-xen/asm/system.h create mode 100644 arch/x86/include/mach-xen/asm/time.h create mode 100644 arch/x86/include/mach-xen/asm/tlbflush.h create mode 100644 arch/x86/include/mach-xen/asm/vga.h create mode 100644 arch/x86/include/mach-xen/asm/xenoprof.h create mode 100644 arch/x86/include/mach-xen/asm/xor.h create mode 100644 arch/x86/include/mach-xen/asm/xor_64.h create mode 100644 arch/x86/kernel/acpi/processor_extcntl_xen.c create mode 100644 arch/x86/kernel/apic/apic-xen.c create mode 100644 arch/x86/kernel/apic/io_apic-xen.c create mode 100644 arch/x86/kernel/apic/ipi-xen.c create mode 100644 arch/x86/kernel/apic/probe_32-xen.c create mode 100644 arch/x86/kernel/cpu/common-xen.c create mode 100644 arch/x86/kernel/cpu/mcheck/mce_dom0.c create mode 100644 arch/x86/kernel/cpu/mtrr/main-xen.c create mode 100644 arch/x86/kernel/e820-xen.c create mode 100644 arch/x86/kernel/early_printk-xen.c create mode 100644 arch/x86/kernel/entry_32-xen.S create mode 100644 arch/x86/kernel/entry_64-xen.S create mode 100644 arch/x86/kernel/fixup.c create mode 100644 arch/x86/kernel/head-xen.c create mode 100644 arch/x86/kernel/head32-xen.c create mode 100644 arch/x86/kernel/head64-xen.c create mode 100644 arch/x86/kernel/head_32-xen.S create mode 100644 arch/x86/kernel/head_64-xen.S create mode 100644 arch/x86/kernel/ioport-xen.c create mode 100644 arch/x86/kernel/irq-xen.c create mode 100644 arch/x86/kernel/irq_work-xen.c create mode 100644 arch/x86/kernel/ldt-xen.c create mode 100644 arch/x86/kernel/machine_kexec_xen.c create mode 100644 arch/x86/kernel/microcode_core-xen.c create mode 100644 arch/x86/kernel/mpparse-xen.c create mode 100644 arch/x86/kernel/msr-xen.c create mode 100644 arch/x86/kernel/pci-dma-xen.c create mode 100644 arch/x86/kernel/pci-nommu-xen.c create mode 100644 arch/x86/kernel/process-xen.c create mode 100644 arch/x86/kernel/process_32-xen.c create mode 100644 arch/x86/kernel/process_64-xen.c create mode 100644 arch/x86/kernel/setup-xen.c create mode 100644 arch/x86/kernel/smp-xen.c create mode 100644 arch/x86/kernel/syscall_32-xen.c create mode 100644 arch/x86/kernel/time-xen.c create mode 100644 arch/x86/kernel/traps-xen.c create mode 100644 arch/x86/kernel/vsyscall_64-xen.c create mode 100644 arch/x86/kernel/x86_init-xen.c create mode 100644 arch/x86/lib/cache-smp-xen.c create mode 100644 arch/x86/lib/memset_64-xen.S create mode 100644 arch/x86/lib/scrub.c create mode 100644 arch/x86/mm/dump_pagetables-xen.c create mode 100644 arch/x86/mm/fault-xen.c create mode 100644 arch/x86/mm/highmem_32-xen.c create mode 100644 arch/x86/mm/hypervisor.c create mode 100644 arch/x86/mm/init-xen.c create mode 100644 arch/x86/mm/init_32-xen.c create mode 100644 arch/x86/mm/init_64-xen.c create mode 100644 arch/x86/mm/iomap_32-xen.c create mode 100644 arch/x86/mm/ioremap-xen.c create mode 100644 arch/x86/mm/pageattr-xen.c create mode 100644 arch/x86/mm/pat-xen.c create mode 100644 arch/x86/mm/pgtable-xen.c create mode 100644 arch/x86/mm/pgtable_32-xen.c create mode 100644 arch/x86/oprofile/xenoprof.c create mode 100644 arch/x86/pci/pcifront.c create mode 100644 arch/x86/platform/efi/efi-xen.c create mode 100644 arch/x86/vdso/vdso32-setup-xen.c create mode 100644 drivers/acpi/processor_extcntl.c create mode 100644 drivers/char/tpm/tpm_vtpm.c create mode 100644 drivers/char/tpm/tpm_vtpm.h create mode 100644 drivers/char/tpm/tpm_xen.c create mode 100644 
drivers/hwmon/coretemp-xen.c create mode 100644 drivers/hwmon/via-cputemp-xen.c create mode 100644 drivers/pci/guestdev.c create mode 100644 drivers/pci/iomulti.c create mode 100644 drivers/pci/iomulti.h create mode 100644 drivers/pci/msi-xen.c create mode 100644 drivers/pci/pci-iomul.c create mode 100644 drivers/pci/reserve.c create mode 100644 drivers/xen/balloon/Makefile create mode 100644 drivers/xen/balloon/balloon.c create mode 100644 drivers/xen/balloon/common.h create mode 100644 drivers/xen/balloon/sysfs.c create mode 100644 drivers/xen/blkback/Makefile create mode 100644 drivers/xen/blkback/blkback-pagemap.c create mode 100644 drivers/xen/blkback/blkback-pagemap.h create mode 100644 drivers/xen/blkback/blkback.c create mode 100644 drivers/xen/blkback/cdrom.c create mode 100644 drivers/xen/blkback/common.h create mode 100644 drivers/xen/blkback/interface.c create mode 100644 drivers/xen/blkback/vbd.c create mode 100644 drivers/xen/blkback/xenbus.c create mode 100644 drivers/xen/blkfront/Makefile create mode 100644 drivers/xen/blkfront/blkfront.c create mode 100644 drivers/xen/blkfront/block.h create mode 100644 drivers/xen/blkfront/vbd.c create mode 100644 drivers/xen/blkfront/vcd.c create mode 100644 drivers/xen/blktap/Makefile create mode 100644 drivers/xen/blktap/blktap.c create mode 100644 drivers/xen/blktap/blocktap.c create mode 100644 drivers/xen/blktap/common.h create mode 100644 drivers/xen/blktap/interface.c create mode 100644 drivers/xen/blktap/xenbus.c create mode 100644 drivers/xen/blktap2-new/Makefile create mode 100644 drivers/xen/blktap2-new/blktap.h create mode 100644 drivers/xen/blktap2-new/control.c create mode 100644 drivers/xen/blktap2-new/device.c create mode 100644 drivers/xen/blktap2-new/request.c create mode 100644 drivers/xen/blktap2-new/ring.c create mode 100644 drivers/xen/blktap2-new/sysfs.c create mode 100644 drivers/xen/blktap2/Makefile create mode 100644 drivers/xen/blktap2/blktap.h create mode 100644 drivers/xen/blktap2/control.c create mode 100644 drivers/xen/blktap2/device.c create mode 100644 drivers/xen/blktap2/request.c create mode 100644 drivers/xen/blktap2/ring.c create mode 100644 drivers/xen/blktap2/sysfs.c create mode 100644 drivers/xen/blktap2/wait_queue.c create mode 100644 drivers/xen/char/Makefile create mode 100644 drivers/xen/char/mem.c create mode 100644 drivers/xen/console/Makefile create mode 100644 drivers/xen/console/console.c create mode 100644 drivers/xen/console/xencons_ring.c create mode 100644 drivers/xen/core/Makefile create mode 100644 drivers/xen/core/acpi_memhotplug.c create mode 100644 drivers/xen/core/clockevents.c create mode 100644 drivers/xen/core/cpu_hotplug.c create mode 100644 drivers/xen/core/domctl.c create mode 100644 drivers/xen/core/domctl.h create mode 100644 drivers/xen/core/evtchn.c create mode 100644 drivers/xen/core/firmware.c create mode 100644 drivers/xen/core/gnttab.c create mode 100644 drivers/xen/core/machine_kexec.c create mode 100644 drivers/xen/core/machine_reboot.c create mode 100644 drivers/xen/core/pcpu.c create mode 100644 drivers/xen/core/reboot.c create mode 100644 drivers/xen/core/smpboot.c create mode 100644 drivers/xen/core/spinlock.c create mode 100644 drivers/xen/core/xen_proc.c create mode 100644 drivers/xen/fbfront/Makefile create mode 100644 drivers/xen/fbfront/xenfb.c create mode 100644 drivers/xen/fbfront/xenkbd.c create mode 100644 drivers/xen/gntdev/Makefile create mode 100644 drivers/xen/gntdev/gntdev.c create mode 100644 drivers/xen/netback/Makefile create mode 100644 
drivers/xen/netback/accel.c create mode 100644 drivers/xen/netback/common.h create mode 100644 drivers/xen/netback/interface.c create mode 100644 drivers/xen/netback/loopback.c create mode 100644 drivers/xen/netback/netback.c create mode 100644 drivers/xen/netback/xenbus.c create mode 100644 drivers/xen/netfront/Makefile create mode 100644 drivers/xen/netfront/accel.c create mode 100644 drivers/xen/netfront/netfront.c create mode 100644 drivers/xen/netfront/netfront.h create mode 100644 drivers/xen/pcifront/Makefile create mode 100644 drivers/xen/pcifront/pci.c create mode 100644 drivers/xen/pcifront/pci_op.c create mode 100644 drivers/xen/pcifront/pcifront.h create mode 100644 drivers/xen/pcifront/xenbus.c create mode 100644 drivers/xen/privcmd/Makefile create mode 100644 drivers/xen/privcmd/compat_privcmd.c create mode 100644 drivers/xen/privcmd/privcmd.c create mode 100644 drivers/xen/scsiback/Makefile create mode 100644 drivers/xen/scsiback/common.h create mode 100644 drivers/xen/scsiback/emulate.c create mode 100644 drivers/xen/scsiback/interface.c create mode 100644 drivers/xen/scsiback/scsiback.c create mode 100644 drivers/xen/scsiback/translate.c create mode 100644 drivers/xen/scsiback/xenbus.c create mode 100644 drivers/xen/scsifront/Makefile create mode 100644 drivers/xen/scsifront/common.h create mode 100644 drivers/xen/scsifront/scsifront.c create mode 100644 drivers/xen/scsifront/xenbus.c create mode 100644 drivers/xen/sfc_netback/Makefile create mode 100644 drivers/xen/sfc_netback/accel.c create mode 100644 drivers/xen/sfc_netback/accel.h create mode 100644 drivers/xen/sfc_netback/accel_debugfs.c create mode 100644 drivers/xen/sfc_netback/accel_fwd.c create mode 100644 drivers/xen/sfc_netback/accel_msg.c create mode 100644 drivers/xen/sfc_netback/accel_solarflare.c create mode 100644 drivers/xen/sfc_netback/accel_solarflare.h create mode 100644 drivers/xen/sfc_netback/accel_xenbus.c create mode 100644 drivers/xen/sfc_netback/ci/compat.h create mode 100644 drivers/xen/sfc_netback/ci/compat/gcc.h create mode 100644 drivers/xen/sfc_netback/ci/compat/gcc_x86.h create mode 100644 drivers/xen/sfc_netback/ci/compat/primitive.h create mode 100644 drivers/xen/sfc_netback/ci/compat/sysdep.h create mode 100644 drivers/xen/sfc_netback/ci/compat/utils.h create mode 100644 drivers/xen/sfc_netback/ci/compat/x86.h create mode 100644 drivers/xen/sfc_netback/ci/compat/x86_64.h create mode 100644 drivers/xen/sfc_netback/ci/tools/config.h create mode 100644 drivers/xen/sfc_netback/ci/tools/debug.h create mode 100644 drivers/xen/sfc_netback/ci/tools/log.h create mode 100644 drivers/xen/sfc_netback/ci/tools/platform/gcc_x86.h create mode 100644 drivers/xen/sfc_netback/ci/tools/platform/linux_kernel.h create mode 100644 drivers/xen/sfc_netback/ci/tools/sysdep.h create mode 100644 drivers/xen/sfc_netfront/Makefile create mode 100644 drivers/xen/sfc_netfront/accel.h create mode 100644 drivers/xen/sfc_netfront/accel_bufs.c create mode 100644 drivers/xen/sfc_netfront/accel_bufs.h create mode 100644 drivers/xen/sfc_netfront/accel_debugfs.c create mode 100644 drivers/xen/sfc_netfront/accel_msg.c create mode 100644 drivers/xen/sfc_netfront/accel_netfront.c create mode 100644 drivers/xen/sfc_netfront/accel_ssr.c create mode 100644 drivers/xen/sfc_netfront/accel_ssr.h create mode 100644 drivers/xen/sfc_netfront/accel_tso.c create mode 100644 drivers/xen/sfc_netfront/accel_tso.h create mode 100644 drivers/xen/sfc_netfront/accel_vi.c create mode 100644 drivers/xen/sfc_netfront/accel_xenbus.c create mode 
100644 drivers/xen/sfc_netfront/ef_vi_falcon.h create mode 100644 drivers/xen/sfc_netfront/ef_vi_falcon_core.h create mode 100644 drivers/xen/sfc_netfront/ef_vi_falcon_desc.h create mode 100644 drivers/xen/sfc_netfront/ef_vi_falcon_event.h create mode 100644 drivers/xen/sfc_netfront/ef_vi_internal.h create mode 100644 drivers/xen/sfc_netfront/etherfabric/ef_vi.h create mode 100644 drivers/xen/sfc_netfront/falcon_event.c create mode 100644 drivers/xen/sfc_netfront/falcon_vi.c create mode 100644 drivers/xen/sfc_netfront/pt_tx.c create mode 100644 drivers/xen/sfc_netfront/sysdep.h create mode 100644 drivers/xen/sfc_netfront/vi_init.c create mode 100644 drivers/xen/sfc_netutil/Makefile create mode 100644 drivers/xen/sfc_netutil/accel_cuckoo_hash.c create mode 100644 drivers/xen/sfc_netutil/accel_cuckoo_hash.h create mode 100644 drivers/xen/sfc_netutil/accel_msg_iface.c create mode 100644 drivers/xen/sfc_netutil/accel_msg_iface.h create mode 100644 drivers/xen/sfc_netutil/accel_shared_fifo.h create mode 100644 drivers/xen/sfc_netutil/accel_util.c create mode 100644 drivers/xen/sfc_netutil/accel_util.h create mode 100644 drivers/xen/tpmback/Makefile create mode 100644 drivers/xen/tpmback/common.h create mode 100644 drivers/xen/tpmback/interface.c create mode 100644 drivers/xen/tpmback/tpmback.c create mode 100644 drivers/xen/tpmback/xenbus.c create mode 100644 drivers/xen/usbback/Makefile create mode 100644 drivers/xen/usbback/interface.c create mode 100644 drivers/xen/usbback/usbback.c create mode 100644 drivers/xen/usbback/usbback.h create mode 100644 drivers/xen/usbback/usbstub.c create mode 100644 drivers/xen/usbback/xenbus.c create mode 100644 drivers/xen/usbfront/Makefile create mode 100644 drivers/xen/usbfront/usbfront-dbg.c create mode 100644 drivers/xen/usbfront/usbfront-hcd.c create mode 100644 drivers/xen/usbfront/usbfront-hub.c create mode 100644 drivers/xen/usbfront/usbfront-q.c create mode 100644 drivers/xen/usbfront/usbfront.h create mode 100644 drivers/xen/usbfront/xenbus.c create mode 100644 drivers/xen/util.c create mode 100644 drivers/xen/xen-pciback/controller.c create mode 100644 drivers/xen/xen-pciback/slot.c create mode 100644 drivers/xen/xenbus/xenbus_backend_client.c create mode 100644 drivers/xen/xenbus/xenbus_dev.c create mode 100644 drivers/xen/xenoprof/xenoprofile.c create mode 100644 include/linux/frontswap.h create mode 100644 include/linux/swapfile.h create mode 100644 include/xen/blkif.h create mode 100644 include/xen/clock.h create mode 100644 include/xen/compat_ioctl.h create mode 100644 include/xen/cpu_hotplug.h create mode 100644 include/xen/driver_util.h create mode 100644 include/xen/firmware.h create mode 100644 include/xen/gnttab.h create mode 100644 include/xen/hypercall.h create mode 100644 include/xen/interface/COPYING create mode 100644 include/xen/interface/arch-x86/cpuid.h create mode 100644 include/xen/interface/arch-x86/hvm/save.h create mode 100644 include/xen/interface/arch-x86/xen-mca.h create mode 100644 include/xen/interface/arch-x86/xen-x86_32.h create mode 100644 include/xen/interface/arch-x86/xen-x86_64.h create mode 100644 include/xen/interface/arch-x86/xen.h create mode 100644 include/xen/interface/arch-x86_32.h create mode 100644 include/xen/interface/arch-x86_64.h create mode 100644 include/xen/interface/dom0_ops.h create mode 100644 include/xen/interface/domctl.h create mode 100644 include/xen/interface/hvm/e820.h create mode 100644 include/xen/interface/hvm/hvm_info_table.h create mode 100644 include/xen/interface/hvm/ioreq.h create 
mode 100644 include/xen/interface/hvm/save.h create mode 100644 include/xen/interface/io/cdromif.h create mode 100644 include/xen/interface/io/fsif.h create mode 100644 include/xen/interface/io/libxenvchan.h create mode 100644 include/xen/interface/io/tpmif.h create mode 100644 include/xen/interface/io/usbif.h create mode 100644 include/xen/interface/io/vscsiif.h create mode 100644 include/xen/interface/kexec.h create mode 100644 include/xen/interface/mem_event.h create mode 100644 include/xen/interface/nmi.h create mode 100644 include/xen/interface/sysctl.h create mode 100644 include/xen/interface/tmem.h create mode 100644 include/xen/interface/trace.h create mode 100644 include/xen/interface/xen-compat.h create mode 100644 include/xen/interface/xenoprof.h create mode 100644 include/xen/interface/xsm/acm.h create mode 100644 include/xen/interface/xsm/acm_ops.h create mode 100644 include/xen/interface/xsm/flask_op.h create mode 100644 include/xen/net-util.h create mode 100644 include/xen/pcifront.h create mode 100644 include/xen/pcpu.h create mode 100644 include/xen/public/Kbuild create mode 100644 include/xen/public/evtchn.h create mode 100644 include/xen/public/gntdev.h create mode 100644 include/xen/public/iomulti.h create mode 100644 include/xen/public/privcmd.h create mode 100644 include/xen/public/xenbus.h create mode 100644 include/xen/sysctl.h create mode 100644 include/xen/xen_proc.h create mode 100644 include/xen/xencons.h create mode 100644 include/xen/xenoprof.h create mode 100644 lib/swiotlb-xen.c create mode 100644 mm/frontswap.c create mode 100644 mm/tmem-xen.c create mode 100644 scripts/Makefile.xen.awk diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-cleancache b/Documentation/ABI/testing/sysfs-kernel-mm-cleancache deleted file mode 100644 index 662ae64..0000000 --- a/Documentation/ABI/testing/sysfs-kernel-mm-cleancache +++ /dev/null @@ -1,11 +0,0 @@ -What: /sys/kernel/mm/cleancache/ -Date: April 2011 -Contact: Dan Magenheimer -Description: - /sys/kernel/mm/cleancache/ contains a number of files which - record a count of various cleancache operations - (sum across all filesystems): - succ_gets - failed_gets - puts - flushes diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index fc5b6ba..7d084eb 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -874,6 +874,24 @@ bytes respectively. Such letter suffixes can also be entirely omitted. gpt [EFI] Forces disk with valid GPT signature but invalid Protective MBR to be treated as GPT. + guestdev= [PCI,ACPI,XEN] + Format: {|}][,{|}[,...]] + Format of device path: [:]-.[-.[,...]][+iomul] + Format of sbdf: [:]:.[+iomul] + Specifies PCI device for guest domain. + If PCI-PCI bridge is specified, all PCI devices + behind PCI-PCI bridge are reserved. + +iomul means that this PCI function will share + IO ports with other +iomul functions under same + switch. NOTE: if +iomul is specfied, all the functions + of the device will share IO ports. + + guestiomuldev= [PCI,ACPI,XEN] + Format: [sbd][,][,...] + Format of sbdf: [:]: + Note: function shouldn't be specified. + Specifies PCI device for IO port multiplexing driver. + hashdist= [KNL,NUMA] Large hashes allocated during boot are distributed across NUMA nodes. Defaults on for 64-bit NUMA, off otherwise. @@ -2114,6 +2132,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted. realloc reallocate PCI resources if allocations done by BIOS are erroneous. 
+ pci_reserve= [PCI] + Format: [[+IO][+MEM]][,...] + Format of sbdf: [:]:. + Specifies the least reserved io size or memory size + which is assigned to PCI bridge even when no child + pci device exists. This is useful with PCI hotplug. + pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power Management. off Disable ASPM. @@ -2289,6 +2314,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Run specified binary instead of /init from the ramdisk, used for early userspace startup. See initrd. + reassign_resources [PCI,ACPI,XEN] + Use guestdev= parameter to reassign device's + resources, or specify =all here. + reboot= [BUGS=X86-32,BUGS=ARM,BUGS=IA-64] Rebooting mode Format: [,[,...]] See arch/*/kernel/reboot.c or arch/*/kernel/process.c diff --git a/Documentation/vm/cleancache.txt b/Documentation/vm/cleancache.txt index 36c367c..850580e 100644 --- a/Documentation/vm/cleancache.txt +++ b/Documentation/vm/cleancache.txt @@ -46,10 +46,11 @@ a negative return value indicates failure. A "put_page" will copy a the pool id, a file key, and a page index into the file. (The combination of a pool id, a file key, and an index is sometimes called a "handle".) A "get_page" will copy the page, if found, from cleancache into kernel memory. -A "flush_page" will ensure the page no longer is present in cleancache; -a "flush_inode" will flush all pages associated with the specified file; -and, when a filesystem is unmounted, a "flush_fs" will flush all pages in -all files specified by the given pool id and also surrender the pool id. +An "invalidate_page" will ensure the page no longer is present in cleancache; +an "invalidate_inode" will invalidate all pages associated with the specified +file; and, when a filesystem is unmounted, an "invalidate_fs" will invalidate +all pages in all files specified by the given pool id and also surrender +the pool id. An "init_shared_fs", like init_fs, obtains a pool id but tells cleancache to treat the pool as shared using a 128-bit UUID as a key. On systems @@ -62,12 +63,12 @@ of the kernel (e.g. by "tools" that control cleancache). Or a cleancache implementation can simply disable shared_init by always returning a negative value. -If a get_page is successful on a non-shared pool, the page is flushed (thus -making cleancache an "exclusive" cache). On a shared pool, the page -is NOT flushed on a successful get_page so that it remains accessible to +If a get_page is successful on a non-shared pool, the page is invalidated +(thus making cleancache an "exclusive" cache). On a shared pool, the page +is NOT invalidated on a successful get_page so that it remains accessible to other sharers. The kernel is responsible for ensuring coherency between cleancache (shared or not), the page cache, and the filesystem, using -cleancache flush operations as required. +cleancache invalidate operations as required. Note that cleancache must enforce put-put-get coherency and get-get coherency. For the former, if two puts are made to the same handle but @@ -77,20 +78,20 @@ if a get for a given handle fails, subsequent gets for that handle will never succeed unless preceded by a successful put with that handle. Last, cleancache provides no SMP serialization guarantees; if two -different Linux threads are simultaneously putting and flushing a page +different Linux threads are simultaneously putting and invalidating a page with the same handle, the results are indeterminate. Callers must lock the page to ensure serial behavior. 
CLEANCACHE PERFORMANCE METRICS -Cleancache monitoring is done by sysfs files in the -/sys/kernel/mm/cleancache directory. The effectiveness of cleancache +If properly configured, monitoring of cleancache is done via debugfs in +the /sys/kernel/debug/cleancache directory. The effectiveness of cleancache can be measured (across all filesystems) with: succ_gets - number of gets that were successful failed_gets - number of gets that failed puts - number of puts attempted (all "succeed") -flushes - number of flushes attempted +invalidates - number of invalidates attempted A backend implementatation may provide additional metrics. @@ -143,7 +144,7 @@ systems. The core hooks for cleancache in VFS are in most cases a single line and the minimum set are placed precisely where needed to maintain -coherency (via cleancache_flush operations) between cleancache, +coherency (via cleancache_invalidate operations) between cleancache, the page cache, and disk. All hooks compile into nothingness if cleancache is config'ed off and turn into a function-pointer- compare-to-NULL if config'ed on but no backend claims the ops @@ -184,15 +185,15 @@ or for real kernel-addressable RAM, it makes perfect sense for transcendent memory. 4) Why is non-shared cleancache "exclusive"? And where is the - page "flushed" after a "get"? (Minchan Kim) + page "invalidated" after a "get"? (Minchan Kim) The main reason is to free up space in transcendent memory and -to avoid unnecessary cleancache_flush calls. If you want inclusive, +to avoid unnecessary cleancache_invalidate calls. If you want inclusive, the page can be "put" immediately following the "get". If put-after-get for inclusive becomes common, the interface could -be easily extended to add a "get_no_flush" call. +be easily extended to add a "get_no_invalidate" call. -The flush is done by the cleancache backend implementation. +The invalidate is done by the cleancache backend implementation. 5) What's the performance impact? @@ -222,7 +223,7 @@ Some points for a filesystem to consider: as tmpfs should not enable cleancache) - To ensure coherency/correctness, the FS must ensure that all file removal or truncation operations either go through VFS or - add hooks to do the equivalent cleancache "flush" operations + add hooks to do the equivalent cleancache "invalidate" operations - To ensure coherency/correctness, either inode numbers must be unique across the lifetime of the on-disk file OR the FS must provide an "encode_fh" function. @@ -243,11 +244,11 @@ If cleancache would use the inode virtual address instead of inode/filehandle, the pool id could be eliminated. But, this won't work because cleancache retains pagecache data pages persistently even when the inode has been pruned from the -inode unused list, and only flushes the data page if the file +inode unused list, and only invalidates the data page if the file gets removed/truncated. So if cleancache used the inode kva, there would be potential coherency issues if/when the inode kva is reused for a different file. Alternately, if cleancache -flushed the pages when the inode kva was freed, much of the value +invalidated the pages when the inode kva was freed, much of the value of cleancache would be lost because the cache of pages in cleanache is potentially much larger than the kernel pagecache and is most useful if the pages survive inode cache removal. 
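The cleancache.txt changes above rename the backend hooks from flush_* to invalidate_* but leave the registration model unchanged: a backend fills in a struct cleancache_ops and hands it to cleancache_register_ops(). As a rough sketch of that contract, a deliberately trivial backend that caches nothing could look like the following; the member names mirror the operations named in the documentation above, while the exact prototypes (the size_t page size argument, struct cleancache_filekey, pgoff_t index) are assumptions to be checked against include/linux/cleancache.h in the base tree, not a definitive implementation.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/cleancache.h>

/* Illustrative no-op backend: accepts nothing, caches nothing. */

static int noop_init_fs(size_t pagesize)
{
        return -1;      /* refuse to hand out a pool id */
}

static int noop_init_shared_fs(char *uuid, size_t pagesize)
{
        return -1;      /* shared pools not supported */
}

static int noop_get_page(int pool_id, struct cleancache_filekey key,
                         pgoff_t index, struct page *page)
{
        return -1;      /* nothing is ever cached, so every get misses */
}

static void noop_put_page(int pool_id, struct cleancache_filekey key,
                          pgoff_t index, struct page *page)
{
        /* a put may always be dropped; cleancache is ephemeral */
}

static void noop_invalidate_page(int pool_id, struct cleancache_filekey key,
                                 pgoff_t index)
{
}

static void noop_invalidate_inode(int pool_id, struct cleancache_filekey key)
{
}

static void noop_invalidate_fs(int pool_id)
{
}

static struct cleancache_ops noop_cleancache_ops = {
        .init_fs          = noop_init_fs,
        .init_shared_fs   = noop_init_shared_fs,
        .get_page         = noop_get_page,
        .put_page         = noop_put_page,
        .invalidate_page  = noop_invalidate_page,
        .invalidate_inode = noop_invalidate_inode,
        .invalidate_fs    = noop_invalidate_fs,
};

static int __init noop_cleancache_init(void)
{
        /* return value (the previously registered ops) ignored here */
        cleancache_register_ops(&noop_cleancache_ops);
        return 0;
}
module_init(noop_cleancache_init);
MODULE_LICENSE("GPL");

Since every get misses and every put is silently dropped, registering such a backend behaves the same as running with cleancache disabled, which is what makes it a reasonable skeleton for a real transcendent-memory backend such as Xen tmem or zcache.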
diff --git a/Documentation/vm/frontswap.txt b/Documentation/vm/frontswap.txt new file mode 100644 index 0000000..5a1a00c --- /dev/null +++ b/Documentation/vm/frontswap.txt @@ -0,0 +1,210 @@ +Frontswap provides a "transcendent memory" interface for swap pages. +In some environments, dramatic performance savings may be obtained because +swapped pages are saved in RAM (or a RAM-like device) instead of a swap disk. + +Frontswap is so named because it can be thought of as the opposite of +a "backing" store for a swap device. The storage is assumed to be +a synchronous concurrency-safe page-oriented "pseudo-RAM device" conforming +to the requirements of transcendent memory (such as Xen's "tmem", or +in-kernel compressed memory, aka "zcache", or future RAM-like devices); +this pseudo-RAM device is not directly accessible or addressable by the +kernel and is of unknown and possibly time-varying size. The driver +links itself to frontswap by calling frontswap_register_ops to set the +frontswap_ops funcs appropriately and the functions it provides must +conform to certain policies as follows: + +An "init" prepares the device to receive frontswap pages associated +with the specified swap device number (aka "type"). A "put_page" will +copy the page to transcendent memory and associate it with the type and +offset associated with the page. A "get_page" will copy the page, if found, +from transcendent memory into kernel memory, but will NOT remove the page +from from transcendent memory. An "invalidate_page" will remove the page +from transcendent memory and an "invalidate_area" will remove ALL pages +associated with the swap type (e.g., like swapoff) and notify the "device" +to refuse further puts with that swap type. + +Once a page is successfully put, a matching get on the page will normally +succeed. So when the kernel finds itself in a situation where it needs +to swap out a page, it first attempts to use frontswap. If the put returns +success, the data has been successfully saved to transcendent memory and +a disk write and, if the data is later read back, a disk read are avoided. +If a put returns failure, transcendent memory has rejected the data, and the +page can be written to swap as usual. + +Note that if a page is put and the page already exists in transcendent memory +(a "duplicate" put), either the put succeeds and the data is overwritten, +or the put fails AND the page is invalidated. This ensures stale data may +never be obtained from frontswap. + +If properly configured, monitoring of frontswap is done via debugfs in +the /sys/kernel/debug/frontswap directory. The effectiveness of +frontswap can be measured (across all swap devices) with: + +failed_puts - how many put attempts have failed +gets - how many gets were attempted (all should succeed) +succ_puts - how many put attempts have succeeded +invalidates - how many invalidates were attempted + +A backend implementation may provide additional metrics. + +FAQ + +1) Where's the value? + +When a workload starts swapping, performance falls through the floor. +Frontswap significantly increases performance in many such workloads by +providing a clean, dynamic interface to read and write swap pages to +"transcendent memory" that is otherwise not directly addressable to the kernel. +This interface is ideal when data is transformed to a different form +and size (such as with compression) or secretly moved (as might be +useful for write-balancing for some RAM-like devices). 
Swap pages (and +evicted page-cache pages) are a great use for this kind of slower-than-RAM- +but-much-faster-than-disk "pseudo-RAM device" and the frontswap (and +cleancache) interface to transcendent memory provides a nice way to read +and write -- and indirectly "name" -- the pages. + +In the virtual case, the whole point of virtualization is to statistically +multiplex physical resources acrosst the varying demands of multiple +virtual machines. This is really hard to do with RAM and efforts to do +it well with no kernel changes have essentially failed (except in some +well-publicized special-case workloads). Frontswap -- and cleancache -- +with a fairly small impact on the kernel, provides a huge amount +of flexibility for more dynamic, flexible RAM multiplexing. +Specifically, the Xen Transcendent Memory backend allows otherwise +"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple +virtual machines, but the pages can be compressed and deduplicated to +optimize RAM utilization. And when guest OS's are induced to surrender +underutilized RAM (e.g. with "self-ballooning"), sudden unexpected +memory pressure may result in swapping; frontswap allows those pages +to be swapped to and from hypervisor RAM if overall host system memory +conditions allow. + +2) Sure there may be performance advantages in some situations, but + what's the space/time overhead of frontswap? + +If CONFIG_FRONTSWAP is disabled, every frontswap hook compiles into +nothingness and the only overhead is a few extra bytes per swapon'ed +swap device. If CONFIG_FRONTSWAP is enabled but no frontswap "backend" +registers, there is one extra global variable compared to zero for +every swap page read or written. If CONFIG_FRONTSWAP is enabled +AND a frontswap backend registers AND the backend fails every "put" +request (i.e. provides no memory despite claiming it might), +CPU overhead is still negligible -- and since every frontswap fail +precedes a swap page write-to-disk, the system is highly likely +to be I/O bound and using a small fraction of a percent of a CPU +will be irrelevant anyway. + +As for space, if CONFIG_FRONTSWAP is enabled AND a frontswap backend +registers, one bit is allocated for every swap page for every swap +device that is swapon'd. This is added to the EIGHT bits (which +was sixteen until about 2.6.34) that the kernel already allocates +for every swap page for every swap device that is swapon'd. (Hugh +Dickins has observed that frontswap could probably steal one of +the existing eight bits, but let's worry about that minor optimization +later.) For very large swap disks (which are rare) on a standard +4K pagesize, this is 1MB per 32GB swap. + +3) OK, how about a quick overview of what this frontswap patch does + in terms that a kernel hacker can grok? + +Let's assume that a frontswap "backend" has registered during +kernel initialization; this registration indicates that this +frontswap backend has access to some "memory" that is not directly +accessible by the kernel. Exactly how much memory it provides is +entirely dynamic and random. + +Whenever a swap-device is swapon'd frontswap_init() is called, +passing the swap device number (aka "type") as a parameter. +This notifies frontswap to expect attempts to "put" swap pages +associated with that number. + +Whenever the swap subsystem is readying a page to write to a swap +device (c.f swap_writepage()), frontswap_put_page is called. 
Frontswap +consults with the frontswap backend and if the backend says it does NOT +have room, frontswap_put_page returns -1 and the kernel swaps the page +to the swap device as normal. Note that the response from the frontswap +backend is unpredictable to the kernel; it may choose to never accept a +page, it could accept every ninth page, or it might accept every +page. But if the backend does accept a page, the data from the page +has already been copied and associated with the type and offset, +and the backend guarantees the persistence of the data. In this case, +frontswap sets a bit in the "frontswap_map" for the swap device +corresponding to the page offset on the swap device to which it would +otherwise have written the data. + +When the swap subsystem needs to swap-in a page (swap_readpage()), +it first calls frontswap_get_page() which checks the frontswap_map to +see if the page was earlier accepted by the frontswap backend. If +it was, the page of data is filled from the frontswap backend and +the swap-in is complete. If not, the normal swap-in code is +executed to obtain the page of data from the real swap device. + +So every time the frontswap backend accepts a page, a swap device read +and (potentially) a swap device write are replaced by a "frontswap backend +put" and (possibly) a "frontswap backend get", which are presumably much +faster. + +4) Can't frontswap be configured as a "special" swap device that is + just higher priority than any real swap device (e.g. like zswap)? + +No. Recall that acceptance of any swap page by the frontswap +backend is entirely unpredictable. This is critical to the definition +of frontswap because it grants completely dynamic discretion to the +backend. But since any "put" might fail, there must always be a real +slot on a real swap device to swap the page. Thus frontswap must be +implemented as a "shadow" to every swapon'd device with the potential +capability of holding every page that the swap device might have held +and the possibility that it might hold no pages at all. +On the downside, this also means that frontswap cannot contain more +pages than the total of swapon'd swap devices. For example, if NO +swap device is configured on some installation, frontswap is useless. + +Further, frontswap is entirely synchronous whereas a real swap +device is, by definition, asynchronous and uses block I/O. The +block I/O layer is not only unnecessary, but may perform "optimizations" +that are inappropriate for a RAM-oriented device including delaying +the write of some pages for a significant amount of time. Synchrony is +required to ensure the dynamicity of the backend and to avoid thorny race +conditions that would unnecessarily and greatly complicate frontswap +and/or the block I/O subsystem. + +In a virtualized environment, the dynamicity allows the hypervisor +(or host OS) to do "intelligent overcommit". For example, it can +choose to accept pages only until host-swapping might be imminent, +then force guests to do their own swapping. In zcache, "poorly" +compressible pages can be rejected, where "poorly" can itself be defined +dynamically depending on current memory constraints. + +5) Why this weird definition about "duplicate puts"? If a page + has been previously successfully put, can't it always be + successfully overwritten? + +Nearly always it can, but no, sometimes it cannot. Consider an example +where data is compressed and the original 4K page has been compressed +to 1K. 
Now an attempt is made to overwrite the page with data that +is non-compressible and so would take the entire 4K. But the backend +has no more space. In this case, the put must be rejected. Whenever +frontswap rejects a put that would overwrite, it also must invalidate +the old data and ensure that it is no longer accessible. Since the +swap subsystem then writes the new data to the real swap device, +this is the correct course of action to ensure coherency. + +6) What is frontswap_shrink for? + +When the (non-frontswap) swap subsystem swaps out a page to a real +swap device, that page is only taking up low-value pre-allocated disk +space. But if frontswap has placed a page in transcendent memory, that +page may be taking up valuable real estate. The frontswap_shrink +routine allows code outside of the swap subsystem (such as Xen tmem +or zcache or some future tmem backend) to force pages out of the memory +managed by frontswap and back into kernel-addressable memory. + +7) Why does the frontswap patch create the new include file swapfile.h? + +The frontswap code depends on some swap-subsystem-internal data +structures that have, over the years, moved back and forth between +static and global. This seemed a reasonable compromise: Define +them as global but declare them in a new include file that isn't +included by the large number of source files that include swap.h. + +Dan Magenheimer, last updated September 12, 2011 diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 9b32d0e1..12a0e6f 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -234,7 +234,7 @@ config IA64_HP_SIM config IA64_XEN_GUEST bool "Xen guest" select SWIOTLB - depends on XEN + depends on PARAVIRT_XEN help Build a kernel that runs on Xen guest domain. At this moment only 16KB page size in supported.
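As an illustration of the hook set and policies that the frontswap.txt text added above describes, here is a minimal, self-contained userspace sketch in plain C. It is only a model: the real frontswap_ops structure, its exact signatures and frontswap_register_ops() are defined by the kernel code in this patch and are mirrored here by name only, and the acceptance policy shown is invented purely for the example.

/*
 * Toy model of the documented frontswap policies: a put may be refused at
 * the backend's discretion, a successful put is later readable via get
 * without being removed, and a refused overwrite must invalidate any older
 * copy so stale data can never be returned.  Names and the "refuse every
 * ninth offset" rule are assumptions made up for this sketch.
 */
#include <stdio.h>
#include <string.h>

#define TOY_PAGE_SIZE 4096
#define TOY_MAX_PAGES 8			/* tiny "pseudo-RAM device" */

struct toy_page {
	int used;
	unsigned int type;		/* swap device number ("type") */
	unsigned long offset;		/* page offset within that device */
	unsigned char data[TOY_PAGE_SIZE];
};

static struct toy_page store[TOY_MAX_PAGES];

static struct toy_page *toy_find(unsigned int type, unsigned long offset)
{
	for (int i = 0; i < TOY_MAX_PAGES; i++)
		if (store[i].used && store[i].type == type &&
		    store[i].offset == offset)
			return &store[i];
	return NULL;
}

/* "invalidate_page": drop any stored copy so stale data is never returned. */
static void toy_invalidate_page(unsigned int type, unsigned long offset)
{
	struct toy_page *p = toy_find(type, offset);

	if (p)
		p->used = 0;
}

/* "put_page": 0 on success, -1 if the backend refuses to take the page. */
static int toy_put_page(unsigned int type, unsigned long offset,
			const void *page)
{
	struct toy_page *p = toy_find(type, offset);

	/*
	 * Model the backend's unpredictable acceptance: arbitrarily refuse
	 * every ninth offset.  A refused put that would have overwritten an
	 * earlier copy must also invalidate that copy (the "duplicate put"
	 * rule in the text above).
	 */
	if (offset % 9 == 8) {
		toy_invalidate_page(type, offset);
		return -1;
	}

	if (!p)				/* new page: look for a free slot */
		for (int i = 0; i < TOY_MAX_PAGES && !p; i++)
			if (!store[i].used)
				p = &store[i];
	if (!p)
		return -1;		/* no room: the kernel would swap to disk */

	p->used = 1;
	p->type = type;
	p->offset = offset;
	memcpy(p->data, page, TOY_PAGE_SIZE);
	return 0;
}

/* "get_page": copy the data out but do NOT remove it from the store. */
static int toy_get_page(unsigned int type, unsigned long offset, void *page)
{
	struct toy_page *p = toy_find(type, offset);

	if (!p)
		return -1;
	memcpy(page, p->data, TOY_PAGE_SIZE);
	return 0;
}

int main(void)
{
	static unsigned char in[TOY_PAGE_SIZE], out[TOY_PAGE_SIZE];

	memset(in, 0xab, sizeof(in));
	if (toy_put_page(0, 42, in) == 0 && toy_get_page(0, 42, out) == 0)
		printf("round trip ok: %d\n", !memcmp(in, out, sizeof(in)));
	return 0;
}

Built with any C99 compiler (e.g. gcc -std=c99), the sketch prints "round trip ok: 1", matching the put/get behaviour described in item 3 of the FAQ above.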
diff --git a/arch/ia64/Makefile b/arch/ia64/Makefile index be7bfa1..342907d 100644 --- a/arch/ia64/Makefile +++ b/arch/ia64/Makefile @@ -55,7 +55,7 @@ core-$(CONFIG_IA64_XEN_GUEST) += arch/ia64/dig/ core-$(CONFIG_IA64_SGI_SN2) += arch/ia64/sn/ core-$(CONFIG_IA64_SGI_UV) += arch/ia64/uv/ core-$(CONFIG_KVM) += arch/ia64/kvm/ -core-$(CONFIG_XEN) += arch/ia64/xen/ +core-$(CONFIG_PARAVIRT_XEN) += arch/ia64/xen/ drivers-$(CONFIG_PCI) += arch/ia64/pci/ drivers-$(CONFIG_IA64_HP_SIM) += arch/ia64/hp/sim/ diff --git a/arch/ia64/include/asm/xen/hypervisor.h b/arch/ia64/include/asm/xen/hypervisor.h index 67455c2..aacad12 100644 --- a/arch/ia64/include/asm/xen/hypervisor.h +++ b/arch/ia64/include/asm/xen/hypervisor.h @@ -34,13 +34,13 @@ #define _ASM_IA64_XEN_HYPERVISOR_H #include +#include +#ifdef CONFIG_PARAVIRT_XEN #include #include /* to compile feature.c */ #include /* to comiple xen-netfront.c */ -#include #include -#ifdef CONFIG_XEN extern struct shared_info *HYPERVISOR_shared_info; extern struct start_info *xen_start_info; diff --git a/arch/ia64/include/asm/xen/interface.h b/arch/ia64/include/asm/xen/interface.h index fbb5198..d950667 100644 --- a/arch/ia64/include/asm/xen/interface.h +++ b/arch/ia64/include/asm/xen/interface.h @@ -56,30 +56,19 @@ #ifndef _ASM_IA64_XEN_INTERFACE_H #define _ASM_IA64_XEN_INTERFACE_H -#define __DEFINE_GUEST_HANDLE(name, type) \ +#define __DEFINE_XEN_GUEST_HANDLE(name, type) \ typedef struct { type *p; } __guest_handle_ ## name #define DEFINE_GUEST_HANDLE_STRUCT(name) \ - __DEFINE_GUEST_HANDLE(name, struct name) -#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name) -#define GUEST_HANDLE(name) __guest_handle_ ## name -#define GUEST_HANDLE_64(name) GUEST_HANDLE(name) + __DEFINE_XEN_GUEST_HANDLE(name, struct name) +#define DEFINE_XEN_GUEST_HANDLE(name) __DEFINE_XEN_GUEST_HANDLE(name, name) +#define XEN_GUEST_HANDLE(name) __guest_handle_ ## name +#define XEN_GUEST_HANDLE_64(name) XEN_GUEST_HANDLE(name) #define set_xen_guest_handle(hnd, val) do { (hnd).p = val; } while (0) #ifndef __ASSEMBLY__ -/* Guest handles for primitive C types. */ -__DEFINE_GUEST_HANDLE(uchar, unsigned char); -__DEFINE_GUEST_HANDLE(uint, unsigned int); -__DEFINE_GUEST_HANDLE(ulong, unsigned long); - -DEFINE_GUEST_HANDLE(char); -DEFINE_GUEST_HANDLE(int); -DEFINE_GUEST_HANDLE(long); -DEFINE_GUEST_HANDLE(void); -DEFINE_GUEST_HANDLE(uint64_t); - +typedef unsigned long xen_ulong_t; typedef unsigned long xen_pfn_t; -DEFINE_GUEST_HANDLE(xen_pfn_t); #define PRI_xen_pfn "lx" #endif @@ -91,7 +80,7 @@ DEFINE_GUEST_HANDLE(xen_pfn_t); /* Maximum number of virtual CPUs in multi-processor guests. */ /* keep sizeof(struct shared_page) <= PAGE_SIZE. * this is checked in arch/ia64/xen/hypervisor.c. */ -#define MAX_VIRT_CPUS 64 +#define XEN_LEGACY_MAX_VCPUS 64 #ifndef __ASSEMBLY__ diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c index af56501..166ced4 100644 --- a/arch/ia64/kernel/asm-offsets.c +++ b/arch/ia64/kernel/asm-offsets.c @@ -290,7 +290,7 @@ void foo(void) DEFINE(IA64_ITC_LASTCYCLE_OFFSET, offsetof (struct itc_jitter_data_t, itc_lastcycle)); -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN BLANK(); DEFINE(XEN_NATIVE_ASM, XEN_NATIVE); diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S index 53c0ba0..ddde313 100644 --- a/arch/ia64/kernel/vmlinux.lds.S +++ b/arch/ia64/kernel/vmlinux.lds.S @@ -183,7 +183,7 @@ SECTIONS { __start_gate_section = .; *(.data..gate) __stop_gate_section = .; -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN . 
= ALIGN(PAGE_SIZE); __xen_start_gate_section = .; *(.data..gate.xen) diff --git a/arch/ia64/xen/Kconfig b/arch/ia64/xen/Kconfig index 515e082..14d8ac6 100644 --- a/arch/ia64/xen/Kconfig +++ b/arch/ia64/xen/Kconfig @@ -2,7 +2,7 @@ # This Kconfig describes xen/ia64 options # -config XEN +config PARAVIRT_XEN bool "Xen hypervisor support" default y depends on PARAVIRT && MCKINLEY && IA64_PAGE_SIZE_16KB && EXPERIMENTAL @@ -16,10 +16,6 @@ config XEN Enable Xen hypervisor support. Resulting kernel runs both as a guest OS on Xen and natively on hardware. -config XEN_XENCOMM - depends on XEN - bool - config NO_IDLE_HZ - depends on XEN + depends on PARAVIRT_XEN bool diff --git a/arch/ia64/xen/xcom_hcall.c b/arch/ia64/xen/xcom_hcall.c index ccaf743..7690fc3 100644 --- a/arch/ia64/xen/xcom_hcall.c +++ b/arch/ia64/xen/xcom_hcall.c @@ -343,7 +343,7 @@ xencommize_memory_reservation(struct xencomm_mini *xc_area, int xencomm_hypercall_memory_op(unsigned int cmd, void *arg) { - GUEST_HANDLE(xen_pfn_t) extent_start_va[2] = { {NULL}, {NULL} }; + XEN_GUEST_HANDLE(xen_pfn_t) extent_start_va[2] = { {NULL}, {NULL} }; struct xen_memory_reservation *xmr = NULL; int rc; struct xencomm_handle *desc; diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index 0e9dec6..006ed9e 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -2,7 +2,7 @@ obj-$(CONFIG_KVM) += kvm/ # Xen paravirtualization support -obj-$(CONFIG_XEN) += xen/ +obj-$(CONFIG_PARAVIRT_XEN) += xen/ # lguest paravirtualization support obj-$(CONFIG_LGUEST_GUEST) += lguest/ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ffcb80b..6e8abc5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -8,7 +8,7 @@ config 64BIT config X86_32 def_bool !64BIT - select CLKSRC_I8253 + select CLKSRC_I8253 if !XEN config X86_64 def_bool 64BIT @@ -20,7 +20,7 @@ config X86 select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE select HAVE_OPROFILE - select HAVE_PCSPKR_PLATFORM + select HAVE_PCSPKR_PLATFORM if !XEN_UNPRIVILEGED_GUEST select HAVE_PERF_EVENTS select HAVE_IRQ_WORK select HAVE_IOREMAP_PROT @@ -42,8 +42,8 @@ config X86 select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE select HAVE_SYSCALL_TRACEPOINTS - select HAVE_KVM - select HAVE_ARCH_KGDB + select HAVE_KVM if !XEN + select HAVE_ARCH_KGDB if !XEN select HAVE_ARCH_TRACEHOOK select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_EFFICIENT_UNALIGNED_ACCESS @@ -51,14 +51,14 @@ config X86 select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_DMA_API_DEBUG select HAVE_KERNEL_GZIP - select HAVE_KERNEL_BZIP2 - select HAVE_KERNEL_LZMA - select HAVE_KERNEL_XZ - select HAVE_KERNEL_LZO + select HAVE_KERNEL_BZIP2 if !XEN + select HAVE_KERNEL_LZMA if !XEN + select HAVE_KERNEL_XZ if !XEN + select HAVE_KERNEL_LZO if !XEN select HAVE_HW_BREAKPOINT select HAVE_MIXED_BREAKPOINTS_REGS select PERF_EVENTS - select HAVE_PERF_EVENTS_NMI + select HAVE_PERF_EVENTS_NMI if !XEN select ANON_INODES select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386 select HAVE_CMPXCHG_LOCAL if !M386 @@ -79,7 +79,7 @@ config X86 select IRQ_FORCED_THREADING select USE_GENERIC_SMP_HELPERS if SMP select HAVE_BPF_JIT if (X86_64 && NET) - select CLKEVT_I8253 + select CLKEVT_I8253 if !XEN select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_IOMAP @@ -101,17 +101,19 @@ config GENERIC_CMOS_UPDATE config CLOCKSOURCE_WATCHDOG def_bool y + depends on !XEN config GENERIC_CLOCKEVENTS def_bool y config ARCH_CLOCKSOURCE_DATA def_bool y - depends on X86_64 + depends on X86_64 && !XEN config GENERIC_CLOCKEVENTS_BROADCAST def_bool y depends on X86_64 || 
(X86_32 && X86_LOCAL_APIC) + depends on !XEN config LOCKDEP_SUPPORT def_bool y @@ -129,7 +131,7 @@ config SBUS bool config NEED_DMA_MAP_STATE - def_bool (X86_64 || INTEL_IOMMU || DMA_API_DEBUG) + def_bool (X86_64 || INTEL_IOMMU || DMA_API_DEBUG || SWIOTLB) config NEED_SG_DMA_LENGTH def_bool y @@ -193,6 +195,7 @@ config NEED_PER_CPU_PAGE_FIRST_CHUNK config ARCH_HIBERNATION_POSSIBLE def_bool y + depends on !XEN config ARCH_SUSPEND_POSSIBLE def_bool y @@ -225,7 +228,15 @@ config X86_64_SMP config X86_HT def_bool y - depends on SMP + depends on SMP && !XEN + +config X86_NO_TSS + def_bool y + depends on XEN + +config X86_NO_IDT + def_bool y + depends on XEN config X86_32_LAZY_GS def_bool y @@ -241,7 +252,7 @@ config KTIME_SCALAR config ARCH_CPU_PROBE_RELEASE def_bool y - depends on HOTPLUG_CPU + depends on HOTPLUG_CPU && !XEN source "init/Kconfig" source "kernel/Kconfig.freezer" @@ -307,13 +318,22 @@ config X86_MPPARSE For old smp systems that do not have proper acpi support. Newer systems (esp with 64bit cpus) with acpi support, MADT and DSDT will override it +config X86_XEN + bool "Xen-compatible" + depends on X86_32 + select XEN + select X86_PAE + help + Choose this option if you plan to run this kernel on top of the + Xen Hypervisor. + config X86_BIGSMP bool "Support for big SMP systems with more than 8 CPUs" - depends on X86_32 && SMP + depends on X86_32 && SMP && !XEN ---help--- This option is needed for the systems that have more than 8 CPUs -if X86_32 +if X86_32 && !XEN config X86_EXTENDED_PLATFORM bool "Support for extended (non-PC) x86 platforms" default y @@ -336,7 +356,14 @@ config X86_EXTENDED_PLATFORM generic distribution kernel, say Y here - otherwise say N. endif -if X86_64 +config X86_64_XEN + bool "Enable Xen compatible kernel" + depends on X86_64 + select XEN + help + This option will compile a kernel compatible with Xen hypervisor + +if X86_64 && !XEN config X86_EXTENDED_PLATFORM bool "Support for extended (non-PC) x86 platforms" default y @@ -538,7 +565,7 @@ config X86_ES7000 config X86_32_IRIS tristate "Eurobraille/Iris poweroff module" - depends on X86_32 + depends on X86_32 && !XEN ---help--- The Iris machines from EuroBraille do not have APM or ACPI support to shut themselves down properly. A special I/O sequence is @@ -563,6 +590,7 @@ config SCHED_OMIT_FRAME_POINTER menuconfig PARAVIRT_GUEST bool "Paravirtualized guest support" + depends on !XEN ---help--- Say Y here to get to see options related to running Linux under various hypervisors. This option alone does not add any kernel code. @@ -643,6 +671,7 @@ config NO_BOOTMEM config MEMTEST bool "Memtest" + depends on !XEN ---help--- This option adds a kernel parameter 'memtest', which allows memtest to be set. @@ -665,6 +694,7 @@ source "arch/x86/Kconfig.cpu" config HPET_TIMER def_bool X86_64 prompt "HPET Timer Support" if X86_32 + depends on !XEN ---help--- Use the IA-PC HPET (High Precision Event Timer) to manage time in preference to the PIT and RTC, if a HPET is @@ -702,6 +732,7 @@ config APB_TIMER config DMI default y bool "Enable DMI scanning" if EXPERT + depends on !XEN_UNPRIVILEGED_GUEST ---help--- Enabled scanning of DMI to identify machine quirks. Say Y here unless you have verified that your setup is not @@ -712,7 +743,7 @@ config GART_IOMMU bool "GART IOMMU support" if EXPERT default y select SWIOTLB - depends on X86_64 && PCI && AMD_NB + depends on X86_64 && PCI && AMD_NB && !X86_64_XEN ---help--- Support for full DMA access of devices with 32bit memory access only on systems with more than 3GB. 
This is usually needed for USB, @@ -727,7 +758,7 @@ config GART_IOMMU config CALGARY_IOMMU bool "IBM Calgary IOMMU support" select SWIOTLB - depends on X86_64 && PCI && EXPERIMENTAL + depends on X86_64 && PCI && !X86_64_XEN && EXPERIMENTAL ---help--- Support for hardware IOMMUs in IBM's xSeries x366 and x460 systems. Needed to run systems with more than 3GB of memory @@ -755,7 +786,8 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT # need this always selected by IOMMU for the VIA workaround config SWIOTLB - def_bool y if X86_64 + def_bool y if X86_64 || XEN + prompt "Software I/O TLB" if XEN_UNPRIVILEGED_GUEST && !XEN_PCIDEV_FRONTEND ---help--- Support for software bounce buffers used on x86-64 systems which don't have a hardware IOMMU (e.g. the current generation @@ -776,11 +808,12 @@ config MAXSMP config NR_CPUS int "Maximum number of CPUs" if SMP && !MAXSMP - range 2 8 if SMP && X86_32 && !X86_BIGSMP + range 2 8 if SMP && X86_32 && !X86_BIGSMP && !X86_XEN range 2 512 if SMP && !MAXSMP default "1" if !SMP default "4096" if MAXSMP default "32" if SMP && (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000) + default "16" if X86_64_XEN default "8" if SMP ---help--- This allows you to specify the maximum number of CPUs which this @@ -823,7 +856,7 @@ source "kernel/Kconfig.preempt" config X86_UP_APIC bool "Local APIC support on uniprocessors" - depends on X86_32 && !SMP && !X86_32_NON_STANDARD + depends on X86_32 && !SMP && !X86_32_NON_STANDARD && !XEN_UNPRIVILEGED_GUEST ---help--- A local APIC (Advanced Programmable Interrupt Controller) is an integrated interrupt controller in the CPU. If you have a single-CPU @@ -849,10 +882,12 @@ config X86_UP_IOAPIC config X86_LOCAL_APIC def_bool y depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC + depends on !XEN_UNPRIVILEGED_GUEST config X86_IO_APIC def_bool y depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC + depends on !XEN_UNPRIVILEGED_GUEST config X86_VISWS_APIC def_bool y @@ -860,7 +895,7 @@ config X86_VISWS_APIC config X86_REROUTE_FOR_BROKEN_BOOT_IRQS bool "Reroute for broken boot IRQs" - depends on X86_IO_APIC + depends on X86_IO_APIC && !XEN ---help--- This option enables a workaround that fixes a source of spurious interrupts. This is recommended when threaded @@ -883,6 +918,7 @@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS config X86_MCE bool "Machine Check / overheating reporting" + depends on !XEN_UNPRIVILEGED_GUEST ---help--- Machine Check support allows the processor to notify the kernel if it detects a problem (e.g. overheating, data corruption). @@ -892,7 +928,7 @@ config X86_MCE config X86_MCE_INTEL def_bool y prompt "Intel MCE features" - depends on X86_MCE && X86_LOCAL_APIC + depends on X86_MCE && X86_LOCAL_APIC && !XEN ---help--- Additional support for intel specific MCE features such as the thermal monitor. @@ -900,14 +936,14 @@ config X86_MCE_INTEL config X86_MCE_AMD def_bool y prompt "AMD MCE features" - depends on X86_MCE && X86_LOCAL_APIC + depends on X86_MCE && X86_LOCAL_APIC && !XEN ---help--- Additional support for AMD specific MCE features such as the DRAM Error Threshold. config X86_ANCIENT_MCE bool "Support for old Pentium 5 / WinChip machine checks" - depends on X86_32 && X86_MCE + depends on X86_32 && X86_MCE && !XEN ---help--- Include support for machine check handling on old Pentium 5 or WinChip systems. These typically need to be enabled explicitely on the command @@ -925,6 +961,10 @@ config X86_MCE_INJECT If you don't know what a machine check is and you don't do kernel QA it is safe to say n. 
+config X86_XEN_MCE + def_bool y + depends on XEN && X86_MCE + config X86_THERMAL_VECTOR def_bool y depends on X86_MCE_INTEL @@ -978,7 +1018,7 @@ config I8K config X86_REBOOTFIXUPS bool "Enable X86 board specific fixups for reboot" - depends on X86_32 + depends on X86_32 && !XEN ---help--- This enables chipset and/or board specific fixups to be done in order to get reboot to work correctly. This is only needed on @@ -995,6 +1035,7 @@ config X86_REBOOTFIXUPS config MICROCODE tristate "/dev/cpu/microcode - microcode support" + depends on !XEN_UNPRIVILEGED_GUEST select FW_LOADER ---help--- If you say Y here, you will be able to update the microcode on @@ -1013,7 +1054,7 @@ config MICROCODE config MICROCODE_INTEL bool "Intel microcode patch loading support" - depends on MICROCODE + depends on MICROCODE && !XEN default MICROCODE select FW_LOADER ---help--- @@ -1026,7 +1067,7 @@ config MICROCODE_INTEL config MICROCODE_AMD bool "AMD microcode patch loading support" - depends on MICROCODE + depends on MICROCODE && !XEN select FW_LOADER ---help--- If you select this option, microcode patch loading support for AMD @@ -1038,6 +1079,7 @@ config MICROCODE_OLD_INTERFACE config X86_MSR tristate "/dev/cpu/*/msr - Model-specific register support" + select XEN_DOMCTL if XEN_PRIVILEGED_GUEST ---help--- This device gives privileged processes access to the x86 Model-Specific Registers (MSRs). It is a character device with @@ -1055,7 +1097,7 @@ config X86_CPUID choice prompt "High Memory Support" - default HIGHMEM64G if X86_NUMAQ + default HIGHMEM64G if X86_NUMAQ || XEN default HIGHMEM4G depends on X86_32 @@ -1098,7 +1140,7 @@ config NOHIGHMEM config HIGHMEM4G bool "4GB" - depends on !X86_NUMAQ + depends on !X86_NUMAQ && !XEN ---help--- Select this if you have a 32-bit processor and between 1 and 4 gigabytes of physical RAM. @@ -1174,12 +1216,12 @@ config ARCH_PHYS_ADDR_T_64BIT def_bool X86_64 || X86_PAE config ARCH_DMA_ADDR_T_64BIT - def_bool X86_64 || HIGHMEM64G + def_bool X86_64 || XEN || HIGHMEM64G config DIRECT_GBPAGES bool "Enable 1GB pages for kernel pagetables" if EXPERT default y - depends on X86_64 + depends on X86_64 && !XEN ---help--- Allow the kernel linear mapping to use 1GB pages on CPUs that support it. This can improve the kernel's performance a tiny bit by @@ -1188,7 +1230,7 @@ config DIRECT_GBPAGES # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support" - depends on SMP + depends on SMP && !XEN depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL) default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP) ---help--- @@ -1289,12 +1331,13 @@ config ARCH_DISCONTIGMEM_DEFAULT config ARCH_SPARSEMEM_ENABLE def_bool y depends on X86_64 || NUMA || (EXPERIMENTAL && X86_32) || X86_32_NON_STANDARD + depends on !XEN select SPARSEMEM_STATIC if X86_32 select SPARSEMEM_VMEMMAP_ENABLE if X86_64 config ARCH_SPARSEMEM_DEFAULT def_bool y - depends on X86_64 + depends on X86_64 && !X86_64_XEN config ARCH_SELECT_MEMORY_MODEL def_bool y @@ -1326,6 +1369,7 @@ config HIGHPTE config X86_CHECK_BIOS_CORRUPTION bool "Check for low memory corruption" + depends on !XEN ---help--- Periodically check for memory corruption in low memory, which is suspected to be caused by BIOS. 
Even when enabled in the @@ -1356,6 +1400,7 @@ config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK config X86_RESERVE_LOW int "Amount of low memory, in kilobytes, to reserve for the BIOS" + depends on !XEN default 64 range 4 640 ---help--- @@ -1386,6 +1431,7 @@ config X86_RESERVE_LOW config MATH_EMULATION bool prompt "Math emulation" if X86_32 + depends on !XEN ---help--- Linux can emulate a math coprocessor (used for floating point operations) if you don't have one. 486DX and Pentium processors have @@ -1412,6 +1458,7 @@ config MATH_EMULATION config MTRR def_bool y prompt "MTRR (Memory Type Range Register) support" if EXPERT + depends on !XEN_UNPRIVILEGED_GUEST ---help--- On Intel P6 family processors (Pentium Pro, Pentium II and later) the Memory Type Range Registers (MTRRs) may be used to control @@ -1447,7 +1494,7 @@ config MTRR config MTRR_SANITIZER def_bool y prompt "MTRR cleanup support" - depends on MTRR + depends on MTRR && !XEN ---help--- Convert MTRR layout from continuous to discrete, so X drivers can add writeback entries. @@ -1477,8 +1524,8 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT config X86_PAT def_bool y - prompt "x86 PAT support" if EXPERT - depends on MTRR + prompt "x86 PAT support" if EXPERT || XEN_UNPRIVILEGED_GUEST + depends on MTRR || (XEN_UNPRIVILEGED_GUEST && XEN_PCIDEV_FRONTEND) ---help--- Use PAT attributes to setup page level cache control. @@ -1505,7 +1552,7 @@ config ARCH_RANDOM config EFI bool "EFI runtime service support" - depends on ACPI + depends on ACPI && !XEN_UNPRIVILEGED_GUEST ---help--- This enables the kernel to use EFI runtime services that are available (such as the EFI variable services). @@ -1519,7 +1566,7 @@ config EFI config EFI_STUB bool "EFI stub support" - depends on EFI + depends on EFI && !XEN ---help--- This kernel feature allows a bzImage to be loaded directly by EFI firmware without the use of a bootloader. @@ -1560,6 +1607,7 @@ source kernel/Kconfig.hz config KEXEC bool "kexec system call" + depends on !XEN_UNPRIVILEGED_GUEST ---help--- kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot @@ -1577,6 +1625,7 @@ config KEXEC config CRASH_DUMP bool "kernel crash dumps" depends on X86_64 || (X86_32 && HIGHMEM) + depends on !XEN ---help--- Generate crash dump after being started by kexec. This should be normally only set in special crash dump kernels @@ -1597,7 +1646,8 @@ config KEXEC_JUMP code in physical address mode via KEXEC config PHYSICAL_START - hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP) + hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP || XEN) + default 0x100000 if XEN default "0x1000000" ---help--- This gives the physical address where the kernel is loaded. 
@@ -1639,6 +1689,7 @@ config PHYSICAL_START config RELOCATABLE bool "Build a relocatable kernel" + depends on !XEN default y ---help--- This builds a kernel image that retains relocation information @@ -1660,7 +1711,8 @@ config X86_NEED_RELOCS depends on X86_32 && RELOCATABLE config PHYSICAL_ALIGN - hex "Alignment value to which kernel should be aligned" if X86_32 + hex "Alignment value to which kernel should be aligned" if X86_32 && !XEN + default 0x2000 if XEN default "0x1000000" range 0x2000 0x1000000 ---help--- @@ -1753,6 +1805,7 @@ endmenu config ARCH_ENABLE_MEMORY_HOTPLUG def_bool y depends on X86_64 || (X86_32 && HIGHMEM) + depends on !XEN config ARCH_ENABLE_MEMORY_HOTREMOVE def_bool y @@ -1770,6 +1823,8 @@ config ARCH_HIBERNATION_HEADER source "kernel/power/Kconfig" +if !XEN_UNPRIVILEGED_GUEST + source "drivers/acpi/Kconfig" source "drivers/sfi/Kconfig" @@ -1780,7 +1835,7 @@ config X86_APM_BOOT menuconfig APM tristate "APM (Advanced Power Management) BIOS support" - depends on X86_32 && PM_SLEEP + depends on X86_32 && PM_SLEEP && !XEN ---help--- APM is a BIOS specification for saving power using several different techniques. This is mostly useful for battery powered laptops with @@ -1905,6 +1960,8 @@ source "drivers/cpuidle/Kconfig" source "drivers/idle/Kconfig" +endif # !XEN_UNPRIVILEGED_GUEST + endmenu @@ -1914,6 +1971,7 @@ config PCI bool "PCI support" default y select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC) + select ARCH_SUPPORTS_MSI if (XEN_UNPRIVILEGED_GUEST && XEN_PCIDEV_FRONTEND) ---help--- Find out whether you have a PCI motherboard. PCI is the name of a bus system, i.e. the way the CPU talks to the other stuff inside @@ -1941,25 +1999,36 @@ choice config PCI_GOBIOS bool "BIOS" + depends on !XEN config PCI_GOMMCONFIG bool "MMConfig" + depends on !XEN_UNPRIVILEGED_GUEST config PCI_GODIRECT bool "Direct" + depends on !XEN_UNPRIVILEGED_GUEST config PCI_GOOLPC bool "OLPC XO-1" - depends on OLPC + depends on OLPC && !XEN_UNPRIVILEGED_GUEST + +config PCI_GOXEN_FE + bool "Xen PCI Frontend" + depends on X86_XEN + help + The PCI device frontend driver allows the kernel to import arbitrary + PCI devices from a PCI backend to support PCI driver domains. config PCI_GOANY bool "Any" + depends on !XEN_UNPRIVILEGED_GUEST endchoice config PCI_BIOS def_bool y - depends on X86_32 && PCI && (PCI_GOBIOS || PCI_GOANY) + depends on X86_32 && PCI && !XEN && (PCI_GOBIOS || PCI_GOANY) # x86-64 doesn't support PCI BIOS access from long mode so always go direct. config PCI_DIRECT @@ -1976,7 +2045,7 @@ config PCI_OLPC config PCI_XEN def_bool y - depends on PCI && XEN + depends on PCI && PARAVIRT_XEN select SWIOTLB_XEN config PCI_DOMAINS @@ -2007,7 +2076,7 @@ source "drivers/pci/Kconfig" # x86_64 have no ISA slots, but can have ISA-style DMA. config ISA_DMA_API - bool "ISA-style DMA support" if (X86_64 && EXPERT) + bool "ISA-style DMA support" if ((X86_64 || XEN) && EXPERT) || XEN_UNPRIVILEGED_GUEST default y help Enables ISA-style DMA support for devices requiring such controllers. @@ -2017,6 +2086,7 @@ if X86_32 config ISA bool "ISA support" + depends on !XEN ---help--- Find out whether you have ISA slots on your motherboard. ISA is the name of a bus system, i.e. the way the CPU talks to the other stuff @@ -2044,6 +2114,7 @@ source "drivers/eisa/Kconfig" config MCA bool "MCA support" + depends on !XEN ---help--- MicroChannel Architecture is found in some IBM PS/2 machines and laptops. It is a bus system similar to PCI or ISA. 
See @@ -2075,7 +2146,7 @@ config SCx200HR_TIMER config OLPC bool "One Laptop Per Child support" - depends on !X86_PAE + depends on !X86_PAE && !XEN select GPIOLIB select OF select OF_PROMTREE @@ -2140,7 +2211,7 @@ endif # X86_32 config AMD_NB def_bool y - depends on CPU_SUP_AMD && PCI + depends on CPU_SUP_AMD && PCI && !XEN_UNPRIVILEGED_GUEST source "drivers/pcmcia/Kconfig" @@ -2210,7 +2281,9 @@ source "net/Kconfig" source "drivers/Kconfig" +if !XEN_UNPRIVILEGED_GUEST source "drivers/firmware/Kconfig" +endif source "fs/Kconfig" diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 3c57033..028a6cf 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -6,7 +6,7 @@ choice config M386 bool "386" - depends on X86_32 && !UML + depends on X86_32 && !UML && !XEN ---help--- This is the processor type of your CPU. This information is used for optimizing purposes. In order to compile a kernel that can run on @@ -47,7 +47,7 @@ config M386 config M486 bool "486" - depends on X86_32 + depends on X86_32 && !XEN ---help--- Select this for a 486 series processor, either Intel or one of the compatible processors from AMD, Cyrix, IBM, or Intel. Includes DX, @@ -56,7 +56,7 @@ config M486 config M586 bool "586/K5/5x86/6x86/6x86MX" - depends on X86_32 + depends on X86_32 && !XEN ---help--- Select this for an 586 or 686 series processor such as the AMD K5, the Cyrix 5x86, 6x86 and 6x86MX. This choice does not @@ -64,14 +64,14 @@ config M586 config M586TSC bool "Pentium-Classic" - depends on X86_32 + depends on X86_32 && !XEN ---help--- Select this for a Pentium Classic processor with the RDTSC (Read Time Stamp Counter) instruction for benchmarking. config M586MMX bool "Pentium-MMX" - depends on X86_32 + depends on X86_32 && !XEN ---help--- Select this for a Pentium with the MMX graphics/multimedia extended instructions. @@ -396,6 +396,7 @@ config X86_P6_NOP config X86_TSC def_bool y depends on ((MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) && !X86_NUMAQ) || X86_64 + depends on !XEN config X86_CMPXCHG64 def_bool y @@ -441,7 +442,7 @@ config CPU_SUP_INTEL config CPU_SUP_CYRIX_32 default y bool "Support Cyrix processors" if PROCESSOR_SELECT - depends on !64BIT + depends on !64BIT && !XEN ---help--- This enables detection, tunings and quirks for Cyrix processors @@ -495,7 +496,7 @@ config CPU_SUP_TRANSMETA_32 config CPU_SUP_UMC_32 default y bool "Support UMC processors" if PROCESSOR_SELECT - depends on !64BIT + depends on !64BIT && !XEN ---help--- This enables detection, tunings and quirks for UMC processors diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index e46c214..2b27700 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -25,6 +25,7 @@ config STRICT_DEVMEM config X86_VERBOSE_BOOTUP bool "Enable verbose x86 bootup info messages" default y + depends on !XEN ---help--- Enables the informational output from the decompression stage (e.g. bzImage) of the boot. 
If you disable this you will still @@ -32,6 +33,7 @@ config X86_VERBOSE_BOOTUP config EARLY_PRINTK bool "Early printk" if EXPERT + depends on !XEN_UNPRIVILEGED_GUEST default y ---help--- Write kernel log output directly into the VGA buffer or to a serial @@ -122,7 +124,7 @@ config DEBUG_NX_TEST config DOUBLEFAULT default y bool "Enable doublefault exception handler" if EXPERT - depends on X86_32 + depends on X86_32 && !X86_NO_TSS ---help--- This option allows trapping of rare doublefault exceptions that would otherwise cause a system to silently reboot. Disabling this @@ -162,6 +164,7 @@ config IOMMU_LEAK config HAVE_MMIOTRACE_SUPPORT def_bool y + depends on !XEN config X86_DECODER_SELFTEST bool "x86 instruction decoder selftest" @@ -250,6 +253,7 @@ config DEBUG_BOOT_PARAMS bool "Debug boot parameters" depends on DEBUG_KERNEL depends on DEBUG_FS + depends on !XEN ---help--- This option will cause struct boot_params to be exported via debugfs. diff --git a/arch/x86/Makefile b/arch/x86/Makefile index a2bfff7..67238b0 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -157,8 +157,27 @@ boot := arch/x86/boot BOOT_TARGETS = bzlilo bzdisk fdimage fdimage144 fdimage288 isoimage -PHONY += bzImage $(BOOT_TARGETS) +PHONY += bzImage vmlinuz $(BOOT_TARGETS) +ifdef CONFIG_XEN +LINUXINCLUDE := -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION) \ + -I$(srctree)/arch/x86/include/mach-xen $(LINUXINCLUDE) + +ifdef CONFIG_X86_64 +LDFLAGS_vmlinux := -e startup_64 +endif + +# Default kernel to build +all: vmlinuz + +# KBUILD_IMAGE specifies the target image being built +KBUILD_IMAGE := $(boot)/vmlinuz + +vmlinuz: vmlinux + $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE) + $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot + $(Q)ln -fsn ../../x86/boot/$@ $(objtree)/arch/$(UTS_MACHINE)/boot/$@ +else # Default kernel to build all: bzImage @@ -172,6 +191,7 @@ endif $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE) $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@ +endif $(BOOT_TARGETS): vmlinux $(Q)$(MAKE) $(build)=$(boot) $@ diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 95365a8..3134798 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -17,6 +17,7 @@ SVGA_MODE := -DSVGA_MODE=NORMAL_VGA targets := vmlinux.bin setup.bin setup.elf bzImage +targets += vmlinuz vmlinux-stripped targets += fdimage fdimage144 fdimage288 image.iso mtools.conf subdir- := compressed @@ -188,6 +189,20 @@ bzlilo: $(obj)/bzImage cp System.map $(INSTALL_PATH)/ if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi +$(obj)/vmlinuz: $(obj)/vmlinux-stripped FORCE + $(call if_changed,gzip) + @echo 'Kernel: $@ is ready' ' (#'`cat .version`')' + +$(obj)/vmlinux-stripped: OBJCOPYFLAGS := -g --strip-unneeded +$(obj)/vmlinux-stripped: vmlinux FORCE + $(call if_changed,objcopy) + +ifndef CONFIG_XEN +bzImage := bzImage +else +bzImage := vmlinuz +endif + install: - sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/bzImage \ + sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/$(bzImage) \ System.map "$(INSTALL_PATH)" diff --git a/arch/x86/ia32/ia32entry-xen.S b/arch/x86/ia32/ia32entry-xen.S new file mode 100644 index 0000000..dce835b --- /dev/null +++ b/arch/x86/ia32/ia32entry-xen.S @@ -0,0 +1,383 @@ +/* + * Compatibility mode system call entry point for x86-64. + * + * Copyright 2000-2002 Andi Kleen, SuSE Labs. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Avoid __ASSEMBLER__'ifying just for this. */ +#include +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_LE 0x40000000 + + .section .entry.text, "ax" + + .macro IA32_ARG_FIXUP noebp=0 + movl %edi,%r8d + .if \noebp + jmp .Lia32_common + .else + movl %ebp,%r9d +.Lia32_common: + .endif + xchg %ecx,%esi + movl %ebx,%edi + movl %edx,%edx /* zero extension */ + .endm + + /* clobbers %eax */ + .macro CLEAR_RREGS offset=0, _r9=rax + xorl %eax,%eax + movq %rax,\offset+R11(%rsp) + movq %rax,\offset+R10(%rsp) + movq %\_r9,\offset+R9(%rsp) + movq %rax,\offset+R8(%rsp) + .endm + + /* + * Reload arg registers from stack in case ptrace changed them. + * We don't reload %eax because syscall_trace_enter() returned + * the %rax value we should see. Instead, we just truncate that + * value to 32 bits again as we did on entry from user mode. + * If it's a new value set by user_regset during entry tracing, + * this matches the normal truncation of the user-mode value. + * If it's -1 to make us punt the syscall, then (u32)-1 is still + * an appropriately invalid value. + */ + .macro LOAD_ARGS32 offset, _r9=0 + .if \_r9 + movl \offset+16(%rsp),%r9d + .endif + movl \offset+40(%rsp),%ecx + movl \offset+48(%rsp),%edx + movl \offset+56(%rsp),%esi + movl \offset+64(%rsp),%edi + movl %eax,%eax /* zero extension */ + .endm + + .macro CFI_STARTPROC32 simple + CFI_STARTPROC \simple + CFI_UNDEFINED r8 + CFI_UNDEFINED r9 + CFI_UNDEFINED r10 + CFI_UNDEFINED r11 + CFI_UNDEFINED r12 + CFI_UNDEFINED r13 + CFI_UNDEFINED r14 + CFI_UNDEFINED r15 + .endm + +#ifdef CONFIG_PARAVIRT +ENTRY(native_usergs_sysret32) + swapgs + sysretl +ENDPROC(native_usergs_sysret32) + +ENTRY(native_irq_enable_sysexit) + swapgs + sti + sysexit +ENDPROC(native_irq_enable_sysexit) +#endif + +/* + * 32bit SYSENTER instruction entry. + * + * Arguments: + * %eax System call number. + * %ebx Arg1 + * %ecx Arg2 + * %edx Arg3 + * %esi Arg4 + * %edi Arg5 + * %ebp user stack + * 0(%ebp) Arg6 + * + * Interrupts on. + * + * This is purely a fast path. For anything complicated we use the int 0x80 + * path below. Set up a complete hardware stack frame to share code + * with the int 0x80 path. 
+ */ +ENTRY(ia32_sysenter_target) + CFI_STARTPROC32 simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA rsp,SS+8-RIP+16 + /*CFI_REL_OFFSET ss,SS-RIP+16*/ + CFI_REL_OFFSET rsp,RSP-RIP+16 + /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/ + /*CFI_REL_OFFSET cs,CS-RIP+16*/ + CFI_REL_OFFSET rip,RIP-RIP+16 + CFI_REL_OFFSET r11,8 + CFI_REL_OFFSET rcx,0 + movq 8(%rsp),%r11 + CFI_RESTORE r11 + popq_cfi %rcx + CFI_RESTORE rcx + movl %ebp,%ebp /* zero extension */ + movl %eax,%eax + movl TI_sysenter_return+THREAD_INFO(%rsp,8*6-KERNEL_STACK_OFFSET),%r10d + movl $__USER32_DS,40(%rsp) + movq %rbp,32(%rsp) + movl $__USER32_CS,16(%rsp) + movq %r10,8(%rsp) + movq %rax,(%rsp) + cld + SAVE_ARGS 0,1,0 + /* no need to do an access_ok check here because rbp has been + 32bit zero extended */ +1: movl (%rbp),%ebp + .section __ex_table,"a" + .quad 1b,ia32_badarg + .previous + orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + jnz sysenter_tracesys + jmp .Lia32_check_call + +#ifdef CONFIG_AUDITSYSCALL + .macro auditsys_entry_common + movl %esi,%r9d /* 6th arg: 4th syscall arg */ + movl %edx,%r8d /* 5th arg: 3rd syscall arg */ + /* (already in %ecx) 4th arg: 2nd syscall arg */ + movl %ebx,%edx /* 3rd arg: 1st syscall arg */ + movl %eax,%esi /* 2nd arg: syscall number */ + movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */ + call __audit_syscall_entry + movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */ + cmpq $(IA32_NR_syscalls-1),%rax + ja ia32_badsys + movl %ebx,%edi /* reload 1st syscall arg */ + movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */ + movl RDX-ARGOFFSET(%rsp),%edx /* reload 3rd syscall arg */ + movl RSI-ARGOFFSET(%rsp),%ecx /* reload 4th syscall arg */ + movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */ + .endm + +sysenter_auditsys: + auditsys_entry_common + movl %ebp,%r9d /* reload 6th syscall arg */ + jmp .Lia32_dispatch +#endif + CFI_ENDPROC +ENDPROC(ia32_sysenter_target) + +/* + * 32bit SYSCALL instruction entry. + * + * Arguments: + * %eax System call number. + * %ebx Arg1 + * %ecx return EIP + * %edx Arg3 + * %esi Arg4 + * %edi Arg5 + * %ebp Arg2 [note: not saved in the stack frame, should not be touched] + * %esp user stack + * 0(%esp) Arg6 + * + * Interrupts on. + * + * This is purely a fast path. For anything complicated we use the int 0x80 + * path below. Set up a complete hardware stack frame to share code + * with the int 0x80 path. 
+ */ +ENTRY(ia32_cstar_target) + CFI_STARTPROC32 simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA rsp,SS+8-RIP+16 + /*CFI_REL_OFFSET ss,SS-RIP+16*/ + CFI_REL_OFFSET rsp,RSP-RIP+16 + /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/ + /*CFI_REL_OFFSET cs,CS-RIP+16*/ + CFI_REL_OFFSET rip,RIP-RIP+16 + movl %eax,%eax /* zero extension */ + movl RSP-RIP+16(%rsp),%r8d + SAVE_ARGS -8,0,0 + movq %rax,ORIG_RAX-ARGOFFSET(%rsp) + movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */ + movl %ebp,%ecx + movl $__USER32_CS,CS-ARGOFFSET(%rsp) + movl $__USER32_DS,SS-ARGOFFSET(%rsp) + /* no need to do an access_ok check here because r8 has been + 32bit zero extended */ + /* hardware stack frame is complete now */ +1: movl (%r8),%r9d + .section __ex_table,"a" + .quad 1b,ia32_badarg + .previous + orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + jnz cstar_tracesys + cmpq $IA32_NR_syscalls-1,%rax + ja ia32_badsys +cstar_do_call: + IA32_ARG_FIXUP 1 + +#ifdef CONFIG_AUDITSYSCALL +cstar_auditsys: + movl %r9d,R9-ARGOFFSET(%rsp) /* register to be clobbered by call */ + auditsys_entry_common + movl R9-ARGOFFSET(%rsp),%r9d /* reload 6th syscall arg */ + jmp .Lia32_dispatch +#endif + +cstar_tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + jz cstar_auditsys +#endif + xchgl %r9d,%ebp + SAVE_REST + CLEAR_RREGS 0, r9 + movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ + movq %rsp,%rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */ + RESTORE_REST + xchgl %ebp,%r9d + cmpq $(IA32_NR_syscalls-1),%rax + ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ + jmp cstar_do_call +END(ia32_cstar_target) + +ia32_badarg: + movq $-EFAULT,%rax + jmp ia32_sysret + CFI_ENDPROC + +/* + * Emulated IA32 system calls via int 0x80. + * + * Arguments: + * %eax System call number. + * %ebx Arg1 + * %ecx Arg2 + * %edx Arg3 + * %esi Arg4 + * %edi Arg5 + * %ebp Arg6 [note: not saved in the stack frame, should not be touched] + * + * Notes: + * Uses the same stack frame as the x86-64 version. + * All registers except %eax must be saved (but ptrace may violate that) + * Arguments are zero extended. For system calls that want sign extension and + * take long arguments a wrapper is needed. Most calls can just be called + * directly. + * Assumes it is only called from user space and entered with interrupts on. + */ + +ENTRY(ia32_syscall) + CFI_STARTPROC32 simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA rsp,SS+8-RIP+16 + /*CFI_REL_OFFSET ss,SS-RIP+16*/ + CFI_REL_OFFSET rsp,RSP-RIP+16 + /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/ + /*CFI_REL_OFFSET cs,CS-RIP+16*/ + CFI_REL_OFFSET rip,RIP-RIP+16 + CFI_REL_OFFSET r11,8 + CFI_REL_OFFSET rcx,0 + movq 8(%rsp),%r11 + CFI_RESTORE r11 + popq_cfi %rcx + CFI_RESTORE rcx + movl %eax,%eax + movq %rax,(%rsp) + cld + /* note the registers are not zero extended to the sf. + this could be a problem. 
*/ + SAVE_ARGS 0,1,0 + orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + jnz ia32_tracesys +.Lia32_check_call: + cmpq $(IA32_NR_syscalls-1),%rax + ja ia32_badsys +ia32_do_call: + IA32_ARG_FIXUP +.Lia32_dispatch: + call *ia32_sys_call_table(,%rax,8) # xxx: rip relative +ia32_sysret: + movq %rax,RAX-ARGOFFSET(%rsp) + CLEAR_RREGS -ARGOFFSET + jmp int_ret_from_sys_call + +sysenter_tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + jz sysenter_auditsys +#endif +ia32_tracesys: + SAVE_REST + CLEAR_RREGS + movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ + movq %rsp,%rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ + RESTORE_REST + cmpq $(IA32_NR_syscalls-1),%rax + ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */ + jmp ia32_do_call +END(ia32_syscall) + +ia32_badsys: + movq $0,ORIG_RAX-ARGOFFSET(%rsp) + movq $-ENOSYS,%rax + jmp ia32_sysret + + CFI_ENDPROC + + .macro PTREGSCALL label, func, arg + ALIGN +GLOBAL(\label) + leaq \func(%rip),%rax + leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ + jmp ia32_ptregs_common + .endm + + CFI_STARTPROC32 + + PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi + PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi + PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx + PTREGSCALL stub32_execve, sys32_execve, %rcx + PTREGSCALL stub32_fork, sys_fork, %rdi + PTREGSCALL stub32_clone, sys32_clone, %rdx + PTREGSCALL stub32_vfork, sys_vfork, %rdi + PTREGSCALL stub32_iopl, sys_iopl, %rsi + + ALIGN +ia32_ptregs_common: + popq %r11 + CFI_ENDPROC + CFI_STARTPROC32 simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA rsp,SS+8-ARGOFFSET + CFI_REL_OFFSET rax,RAX-ARGOFFSET + CFI_REL_OFFSET rcx,RCX-ARGOFFSET + CFI_REL_OFFSET rdx,RDX-ARGOFFSET + CFI_REL_OFFSET rsi,RSI-ARGOFFSET + CFI_REL_OFFSET rdi,RDI-ARGOFFSET + CFI_REL_OFFSET rip,RIP-ARGOFFSET +/* CFI_REL_OFFSET cs,CS-ARGOFFSET*/ +/* CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/ + CFI_REL_OFFSET rsp,RSP-ARGOFFSET +/* CFI_REL_OFFSET ss,SS-ARGOFFSET*/ + SAVE_REST + call *%rax + RESTORE_REST + jmp ia32_sysret /* misbalances the return cache */ + CFI_ENDPROC +END(ia32_ptregs_common) diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 610001d..d39db8a 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -31,6 +31,10 @@ #include #include +#ifdef CONFIG_XEN_PRIVILEGED_GUEST +#include +#endif + #define COMPILER_DEPENDENT_INT64 long long #define COMPILER_DEPENDENT_UINT64 unsigned long long @@ -115,7 +119,11 @@ static inline void acpi_disable_pci(void) } /* Low-level suspend routine. 
*/ +#ifdef CONFIG_ACPI_PV_SLEEP +#define acpi_suspend_lowlevel() acpi_enter_sleep_state(ACPI_STATE_S3) +#else extern int acpi_suspend_lowlevel(void); +#endif extern const unsigned char acpi_wakeup_code[]; #define acpi_wakeup_address (__pa(TRAMPOLINE_SYM(acpi_wakeup_code))) @@ -123,11 +131,33 @@ extern const unsigned char acpi_wakeup_code[]; /* early initialization routine */ extern void acpi_reserve_wakeup_memory(void); +#ifdef CONFIG_XEN_PRIVILEGED_GUEST +static inline int acpi_notify_hypervisor_state(u8 sleep_state, + u32 pm1a_cnt_val, + u32 pm1b_cnt_val) +{ + struct xen_platform_op op = { + .cmd = XENPF_enter_acpi_sleep, + .interface_version = XENPF_INTERFACE_VERSION, + .u = { + .enter_acpi_sleep = { + .pm1a_cnt_val = pm1a_cnt_val, + .pm1b_cnt_val = pm1b_cnt_val, + .sleep_state = sleep_state, + }, + }, + }; + + return HYPERVISOR_platform_op(&op); +} +#endif + /* * Check if the CPU can handle C2 and deeper */ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate) { +#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL /* * Early models (<=5) of AMD Opterons are not supposed to go into * C2 state. @@ -142,6 +172,7 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate) else if (amd_e400_c1e_detected) return 1; else +#endif return max_cstate; } @@ -181,7 +212,9 @@ static inline void disable_acpi(void) { } #endif /* !CONFIG_ACPI */ +#ifndef CONFIG_XEN #define ARCH_HAS_POWER_INIT 1 +#endif #ifdef CONFIG_ACPI_NUMA extern int acpi_numa; diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h index eec2a70..91e72c0 100644 --- a/arch/x86/include/asm/agp.h +++ b/arch/x86/include/asm/agp.h @@ -15,6 +15,9 @@ #define map_page_into_agp(page) set_pages_uc(page, 1) #define unmap_page_from_agp(page) set_pages_wb(page, 1) +#define map_pages_into_agp set_pages_array_uc +#define unmap_pages_from_agp set_pages_array_wb + /* * Could use CLFLUSH here if the cpu supports it. 
But then it would * need to be called for each cacheline of the whole page so it may diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 3ab9bdd..be3c6f6 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -9,12 +9,16 @@ #include #include #include +#ifndef CONFIG_XEN #include +#endif #include #include #include +#ifndef CONFIG_XEN #define ARCH_APICTIMER_STOPS_ON_C3 1 +#endif /* * Debugging macros @@ -46,6 +50,7 @@ static inline void generic_apic_probe(void) #ifdef CONFIG_X86_LOCAL_APIC extern unsigned int apic_verbosity; +#ifndef CONFIG_XEN extern int local_apic_timer_c2_ok; extern int disable_apic; @@ -119,6 +124,8 @@ extern u64 native_apic_icr_read(void); extern int x2apic_mode; +#endif /* CONFIG_XEN */ + #ifdef CONFIG_X86_X2APIC /* * Make previous memory operations globally visible before @@ -238,7 +245,11 @@ extern void setup_local_APIC(void); extern void end_local_APIC_setup(void); extern void bsp_end_local_APIC_setup(void); extern void init_apic_mappings(void); +#ifndef CONFIG_XEN void register_lapic_address(unsigned long address); +#else +#define register_lapic_address(address) +#endif extern void setup_boot_APIC_clock(void); extern void setup_secondary_APIC_clock(void); extern int APIC_init_uniprocessor(void); @@ -286,15 +297,18 @@ static inline void disable_local_APIC(void) { } struct apic { char *name; +#ifndef CONFIG_XEN int (*probe)(void); int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); int (*apic_id_registered)(void); +#endif u32 irq_delivery_mode; u32 irq_dest_mode; const struct cpumask *(*target_cpus)(void); +#ifndef CONFIG_XEN int disable_esr; int dest_logical; @@ -313,8 +327,10 @@ struct apic { void (*setup_portio_remap)(void); int (*check_phys_apicid_present)(int phys_apicid); void (*enable_apic_mode)(void); +#endif int (*phys_pkg_id)(int cpuid_apic, int index_msb); +#ifndef CONFIG_XEN /* * When one of the next two hooks returns 1 the apic * is switched to this. Essentially they are additional @@ -329,6 +345,7 @@ struct apic { unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask); unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask, const struct cpumask *andmask); +#endif /* ipi */ void (*send_IPI_mask)(const struct cpumask *mask, int vector); @@ -338,6 +355,7 @@ struct apic { void (*send_IPI_all)(int vector); void (*send_IPI_self)(int vector); +#ifndef CONFIG_XEN /* wakeup_secondary_cpu */ int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip); @@ -377,6 +395,7 @@ struct apic { */ int (*x86_32_numa_cpu_node)(int cpu); #endif +#endif /* CONFIG_XEN */ }; /* @@ -386,6 +405,8 @@ struct apic { */ extern struct apic *apic; +#ifndef CONFIG_XEN + /* * APIC drivers are probed based on how they are listed in the .apicdrivers * section. 
So the order is important and enforced by the ordering @@ -504,6 +525,7 @@ static inline void default_wait_for_init_deassert(atomic_t *deassert) extern void generic_bigsmp_probe(void); +#endif /* CONFIG_XEN */ #ifdef CONFIG_X86_LOCAL_APIC @@ -520,6 +542,8 @@ static inline const struct cpumask *default_target_cpus(void) #endif } +#ifndef CONFIG_XEN + DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); @@ -626,6 +650,8 @@ extern int default_cpu_present_to_apicid(int mps_cpu); extern int default_check_phys_apicid_present(int phys_apicid); #endif +#endif /* CONFIG_XEN */ + #endif /* CONFIG_X86_LOCAL_APIC */ #endif /* _ASM_X86_APIC_H */ diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 134bba0..96fd18b 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -17,6 +17,8 @@ */ #define IO_APIC_SLOT_SIZE 1024 +#ifndef CONFIG_XEN + #define APIC_ID 0x20 #define APIC_LVR 0x30 @@ -147,6 +149,16 @@ #define XAPIC_ENABLE (1UL << 11) #define X2APIC_ENABLE (1UL << 10) +#else /* CONFIG_XEN */ + +enum { + APIC_DEST_ALLBUT = 0x1, + APIC_DEST_SELF, + APIC_DEST_ALLINC +}; + +#endif /* CONFIG_XEN */ + #ifdef CONFIG_X86_32 # define MAX_IO_APICS 64 # define MAX_LOCAL_APIC 256 @@ -155,6 +167,8 @@ # define MAX_LOCAL_APIC 32768 #endif +#ifndef CONFIG_XEN + /* * All x86-64 systems are xAPIC compatible. * In the following, "apicid" is a physical APIC ID. @@ -425,6 +439,8 @@ struct local_apic { #undef u32 +#endif /* CONFIG_XEN */ + #ifdef CONFIG_X86_32 #define BAD_APICID 0xFFu #else diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 5e1a2ee..2d2275a 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -16,7 +16,7 @@ & ~(CONFIG_PHYSICAL_ALIGN - 1)) /* Minimum kernel alignment, as a power of two */ -#ifdef CONFIG_X86_64 +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) #define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT #else #define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT + THREAD_ORDER) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index dcb839e..dc9a154 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -293,7 +293,11 @@ extern const char * const x86_power_flags[32]; #define cpu_has_xmm4_1 boot_cpu_has(X86_FEATURE_XMM4_1) #define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2) #define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) +#ifndef CONFIG_XEN #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) +#else +#define cpu_has_xsave boot_cpu_has(X86_FEATURE_OSXSAVE) +#endif #define cpu_has_osxsave boot_cpu_has(X86_FEATURE_OSXSAVE) #define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) #define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ) diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index b903d5e..32cc72e 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -101,7 +101,7 @@ extern void aout_dump_debugregs(struct user *dump); extern void hw_breakpoint_restore(void); -#ifdef CONFIG_X86_64 +#if defined(CONFIG_X86_64) && !defined(CONFIG_X86_NO_IDT) DECLARE_PER_CPU(int, debug_stack_usage); static inline void debug_stack_usage_inc(void) { diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 3778256..ba52483 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -66,7 +66,11 @@ struct e820map { struct e820entry map[E820_X_MAX]; }; +#ifndef CONFIG_XEN #define ISA_START_ADDRESS 0xa0000 +#else +#define ISA_START_ADDRESS 0 +#endif #define 
ISA_END_ADDRESS 0x100000 #define BIOS_BEGIN 0x000a0000 diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index da0b3ca..b0ce06c 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -19,7 +19,11 @@ typedef struct { #ifdef CONFIG_SMP unsigned int irq_resched_count; unsigned int irq_call_count; +#ifndef CONFIG_XEN unsigned int irq_tlb_count; +#else + unsigned int irq_lock_count; +#endif #endif #ifdef CONFIG_X86_THERMAL_VECTOR unsigned int irq_thermal_count; diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index eb92a6e..b54fa50 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -101,6 +101,7 @@ static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr, irq_attr->polarity = polarity; } +#ifndef CONFIG_XEN struct irq_2_iommu { struct intel_iommu *iommu; u16 irte_index; @@ -123,6 +124,9 @@ struct irq_cfg { struct irq_2_iommu irq_2_iommu; #endif }; +#else +struct irq_cfg; +#endif extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *); extern void send_cleanup_vector(struct irq_cfg *); @@ -159,9 +163,15 @@ extern void smp_invalidate_interrupt(struct pt_regs *); #else extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *); #endif +extern void smp_irq_work_interrupt(struct pt_regs *); +#ifdef CONFIG_XEN +extern void smp_reboot_interrupt(struct pt_regs *); +#endif #endif +#ifndef CONFIG_XEN extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void); +#endif typedef int vector_irq_t[NR_VECTORS]; DECLARE_PER_CPU(vector_irq_t, vector_irq); diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 7a15153..2bb1d90 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -60,3 +60,7 @@ static inline bool hypervisor_x2apic_available(void) } #endif + +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include_next +#endif diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index a203659..b9daf61 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -54,6 +54,7 @@ extern struct irq_chip i8259A_chip; struct legacy_pic { int nr_legacy_irqs; +#ifndef CONFIG_XEN struct irq_chip *chip; void (*mask)(unsigned int irq); void (*unmask)(unsigned int irq); @@ -61,6 +62,7 @@ struct legacy_pic { void (*restore_mask)(void); void (*init)(int auto_eoi); int (*irq_pending)(unsigned int irq); +#endif void (*make_irq)(unsigned int irq); }; diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index d8e8eef..51cd71d 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -331,7 +331,7 @@ extern void early_iounmap(void __iomem *addr, unsigned long size); extern void fixup_early_ioremap(void); extern bool is_early_ioremap_ptep(pte_t *ptep); -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN #include struct bio_vec; @@ -341,7 +341,7 @@ extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, #define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) && \ (!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2))) -#endif /* CONFIG_XEN */ +#endif /* CONFIG_PARAVIRT_XEN */ #define IO_SPACE_LIMIT 0xffff diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 317ff17..2e587bc 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -5,14 +5,30 @@ # define PA_CONTROL_PAGE 0 # define VA_CONTROL_PAGE 1 # define PA_PGD 2 +# ifndef CONFIG_XEN # define PA_SWAP_PAGE 3 # 
define PAGES_NR 4 +# else /* CONFIG_XEN */ +/* + * The hypervisor interface implicitly requires that all entries (except + * for possibly the final one) are arranged in matching PA_/VA_ pairs. +# define VA_PGD 3 + */ +# define PA_SWAP_PAGE 4 +# define PAGES_NR 5 +# endif /* CONFIG_XEN */ #else # define PA_CONTROL_PAGE 0 # define VA_CONTROL_PAGE 1 # define PA_TABLE_PAGE 2 +# ifndef CONFIG_XEN # define PA_SWAP_PAGE 3 # define PAGES_NR 4 +# else /* CONFIG_XEN, see comment above +# define VA_TABLE_PAGE 3 */ +# define PA_SWAP_PAGE 4 +# define PAGES_NR 5 +# endif /* CONFIG_XEN */ #endif # define KEXEC_CONTROL_CODE_MAX_SIZE 2048 @@ -163,6 +179,19 @@ struct kimage_arch { }; #endif +/* Under Xen we need to work with machine addresses. These macros give the + * machine address of a certain page to the generic kexec code instead of + * the pseudo physical address which would be given by the default macros. + */ + +#ifdef CONFIG_XEN +#define KEXEC_ARCH_HAS_PAGE_MACROS +#define kexec_page_to_pfn(page) pfn_to_mfn(page_to_pfn(page)) +#define kexec_pfn_to_page(pfn) pfn_to_page(mfn_to_pfn(pfn)) +#define kexec_virt_to_phys(addr) virt_to_machine(addr) +#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr)) +#endif + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_KEXEC_H */ diff --git a/arch/x86/include/asm/mach_traps.h b/arch/x86/include/asm/mach_traps.h index a01e7ec7..a39d3e1 100644 --- a/arch/x86/include/asm/mach_traps.h +++ b/arch/x86/include/asm/mach_traps.h @@ -5,6 +5,8 @@ #ifndef _ASM_X86_MACH_DEFAULT_MACH_TRAPS_H #define _ASM_X86_MACH_DEFAULT_MACH_TRAPS_H +#include +#include #include #define NMI_REASON_PORT 0x61 @@ -22,6 +24,29 @@ static inline unsigned char default_get_nmi_reason(void) return inb(NMI_REASON_PORT); } +static inline void clear_serr_error(unsigned char reason) +{ + reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR; + outb(reason, NMI_REASON_PORT); +} + +static inline void clear_io_check_error(unsigned char reason) +{ + unsigned long i; + + reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK; + outb(reason, NMI_REASON_PORT); + + i = 20000; + while (--i) { + touch_nmi_watchdog(); + udelay(100); + } + + reason &= ~NMI_REASON_CLEAR_IOCHK; + outb(reason, NMI_REASON_PORT); +} + static inline void reassert_nmi(void) { int old_reg = -1; diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h index 0e8e85b..6c8b3a8 100644 --- a/arch/x86/include/asm/mc146818rtc.h +++ b/arch/x86/include/asm/mc146818rtc.h @@ -14,7 +14,7 @@ #define RTC_ALWAYS_BCD 1 /* RTC operates in binary mode */ #endif -#if defined(CONFIG_X86_32) && defined(__HAVE_ARCH_CMPXCHG) +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN) && defined(__HAVE_ARCH_CMPXCHG) /* * This lock provides nmi access to the CMOS/RTC registers. It has some * special properties. It is owned by a CPU and stores the index register diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 5f55e69..e9b162a 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -16,12 +16,15 @@ typedef struct { /* True if mm supports a task running in 32 bit compatibility mode. 
*/ unsigned short ia32_compat; #endif +#ifdef CONFIG_XEN + bool has_foreign_mappings:1; +#endif struct mutex lock; void *vdso; } mm_context_t; -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) void leave_mm(int cpu); #else static inline void leave_mm(int cpu) diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index fd3f9f1..d3d8968 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -17,7 +17,10 @@ struct ctl_table; extern int proc_nmi_enabled(struct ctl_table *, int , void __user *, size_t *, loff_t *); extern int unknown_nmi_panic; +#endif +#if (defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)) || \ + (defined(CONFIG_XEN_SMPBOOT) && CONFIG_XEN_COMPAT >= 0x030200) void arch_trigger_all_cpu_backtrace(void); #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace #endif diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 7639dbf..63775df 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -69,7 +69,15 @@ extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_FLATMEM +/* + * While max_pfn is not exported, max_mapnr never gets initialized for non-Xen + * other than for hotplugged memory. + */ +#ifndef CONFIG_XEN #define pfn_valid(pfn) ((pfn) < max_pfn) +#else +#define pfn_valid(pfn) ((pfn) < max_mapnr) +#endif #endif #endif /* _ASM_X86_PAGE_64_DEFS_H */ diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 3566454..b229c07 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -133,6 +133,8 @@ struct pt_regs { #include #ifdef CONFIG_PARAVIRT #include +#elif defined(CONFIG_X86_64_XEN) +#include #endif struct cpuinfo_x86; @@ -193,7 +195,13 @@ static inline int v8086_mode(struct pt_regs *regs) #ifdef CONFIG_X86_64 static inline bool user_64bit_mode(struct pt_regs *regs) { -#ifndef CONFIG_PARAVIRT +#if defined(CONFIG_XEN) + /* + * On Xen, these are the only long mode CPL 3 selectors. + * We do not allow long mode selectors in the LDT. + */ + return regs->cs == __USER_CS || regs->cs == FLAT_USER_CS64; +#elif !defined(CONFIG_PARAVIRT) /* * On non-paravirt systems, this is the only long mode CPL 3 * selector. We do not allow long mode selectors in the LDT. 
@@ -286,7 +294,9 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, } #define arch_has_single_step() (1) -#ifdef CONFIG_X86_DEBUGCTLMSR +#if defined(CONFIG_XEN) +#define arch_has_block_step() (0) +#elif defined(CONFIG_X86_DEBUGCTLMSR) #define arch_has_block_step() (1) #else #define arch_has_block_step() (boot_cpu_data.x86 >= 6) diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index 6c7fc25..b0549bf 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -48,7 +48,7 @@ #endif #ifdef CONFIG_X86_64 -#ifdef CONFIG_PARAVIRT +#if defined(CONFIG_PARAVIRT) || defined(CONFIG_XEN) /* Paravirtualized systems may not have PSE or PGE available */ #define NEED_PSE 0 #define NEED_PGE 0 diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 5e64171..1a47771 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -188,7 +188,9 @@ #define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8+3) #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8+3) -#ifndef CONFIG_PARAVIRT +#if defined(CONFIG_X86_XEN) +#define get_kernel_rpl() (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) +#elif !defined(CONFIG_PARAVIRT) #define get_kernel_rpl() 0 #endif diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index cfd8144..8d30476 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -95,6 +95,9 @@ struct thread_info { #define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */ #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ #define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ +#if defined(CONFIG_X86_XEN) && defined(CONFIG_CPU_SUP_AMD) +#define TIF_CSTAR 31 /* cstar-based syscall (special handling) */ +#endif #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -116,6 +119,7 @@ struct thread_info { #define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP) #define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) +#define _TIF_CSTAR (1 << TIF_CSTAR) /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ @@ -143,9 +147,13 @@ struct thread_info { _TIF_USER_RETURN_NOTIFY) /* flags to check in __switch_to() */ +#ifndef CONFIG_XEN #define _TIF_WORK_CTXSW \ (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP) +#else +#define _TIF_WORK_CTXSW (_TIF_NOTSC /*todo | _TIF_BLOCKSTEP */) +#endif #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY) #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index b9676ae..7fb31d4 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -30,7 +30,7 @@ # define ENABLE_TOPO_DEFINES # endif #else -# ifdef CONFIG_SMP +# if defined(CONFIG_SMP) && !defined(CONFIG_XEN) # define ENABLE_TOPO_DEFINES # endif #endif diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h index feca311..0a55878 100644 --- a/arch/x86/include/asm/trampoline.h +++ b/arch/x86/include/asm/trampoline.h @@ -1,4 +1,4 @@ -#ifndef _ASM_X86_TRAMPOLINE_H +#if !defined(_ASM_X86_TRAMPOLINE_H) && !defined(CONFIG_XEN) #define _ASM_X86_TRAMPOLINE_H #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 
0012d09..37da0d0 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -40,6 +40,9 @@ asmlinkage void alignment_check(void); asmlinkage void machine_check(void); #endif /* CONFIG_X86_MCE */ asmlinkage void simd_coprocessor_error(void); +#ifdef CONFIG_X86_XEN +asmlinkage void fixup_4gb_segment(void); +#endif dotraplinkage void do_divide_error(struct pt_regs *, long); dotraplinkage void do_debug(struct pt_regs *, long); @@ -68,6 +71,9 @@ dotraplinkage void do_machine_check(struct pt_regs *, long); dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long); #ifdef CONFIG_X86_32 dotraplinkage void do_iret_error(struct pt_regs *, long); +#ifdef CONFIG_XEN +void do_fixup_4gb_segment(struct pt_regs *, long); +#endif #endif static inline int get_si_code(unsigned long condition) diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 21f7385..1c44df1 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -11,7 +11,7 @@ #ifndef _ASM_X86_UV_UV_HUB_H #define _ASM_X86_UV_UV_HUB_H -#ifdef CONFIG_X86_64 +#ifdef CONFIG_X86_UV #include #include #include diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 5728852..9219196 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -48,6 +48,7 @@ #include #include #include +#include /* * The hypercall asms have to meet several constraints: diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 66d0fff..41ff2bd 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -58,7 +58,7 @@ static inline uint32_t xen_cpuid_base(void) return 0; } -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN extern bool xen_hvm_need_lapic(void); static inline bool xen_x2apic_para_available(void) diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index a1f2db5..40c95d2 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -10,17 +10,20 @@ #define _ASM_X86_XEN_INTERFACE_H #ifdef __XEN__ -#define __DEFINE_GUEST_HANDLE(name, type) \ +#define ___DEFINE_XEN_GUEST_HANDLE(name, type) \ typedef struct { type *p; } __guest_handle_ ## name #else -#define __DEFINE_GUEST_HANDLE(name, type) \ +#define ___DEFINE_XEN_GUEST_HANDLE(name, type) \ typedef type * __guest_handle_ ## name #endif +#define __DEFINE_XEN_GUEST_HANDLE(name, type) \ + ___DEFINE_XEN_GUEST_HANDLE(name, type); \ + ___DEFINE_XEN_GUEST_HANDLE(const_##name, const type) #define DEFINE_GUEST_HANDLE_STRUCT(name) \ - __DEFINE_GUEST_HANDLE(name, struct name) -#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name) -#define GUEST_HANDLE(name) __guest_handle_ ## name + __DEFINE_XEN_GUEST_HANDLE(name, struct name) +#define DEFINE_XEN_GUEST_HANDLE(name) __DEFINE_XEN_GUEST_HANDLE(name, name) +#define XEN_GUEST_HANDLE(name) __guest_handle_ ## name #ifdef __XEN__ #if defined(__i386__) @@ -47,15 +50,8 @@ #endif #ifndef __ASSEMBLY__ -/* Guest handles for primitive C types. 
*/ -__DEFINE_GUEST_HANDLE(uchar, unsigned char); -__DEFINE_GUEST_HANDLE(uint, unsigned int); -__DEFINE_GUEST_HANDLE(ulong, unsigned long); -DEFINE_GUEST_HANDLE(char); -DEFINE_GUEST_HANDLE(int); -DEFINE_GUEST_HANDLE(long); -DEFINE_GUEST_HANDLE(void); -DEFINE_GUEST_HANDLE(uint64_t); +typedef unsigned long xen_pfn_t; +typedef unsigned long xen_ulong_t; #endif #ifndef HYPERVISOR_VIRT_START @@ -67,7 +63,7 @@ DEFINE_GUEST_HANDLE(uint64_t); #define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>__MACH2PHYS_SHIFT) /* Maximum number of virtual CPUs in multi-processor guests. */ -#define MAX_VIRT_CPUS 32 +#define XEN_LEGACY_MAX_VCPUS 32 /* * SEGMENT DESCRIPTOR TABLES diff --git a/arch/x86/include/mach-xen/asm/agp.h b/arch/x86/include/mach-xen/asm/agp.h new file mode 100644 index 0000000..e2a122d --- /dev/null +++ b/arch/x86/include/mach-xen/asm/agp.h @@ -0,0 +1,58 @@ +#ifndef _ASM_X86_AGP_H +#define _ASM_X86_AGP_H + +#include +#include +#include + +/* + * Functions to keep the agpgart mappings coherent with the MMU. The + * GART gives the CPU a physical alias of pages in memory. The alias + * region is mapped uncacheable. Make sure there are no conflicting + * mappings with different cachability attributes for the same + * page. This avoids data corruption on some CPUs. + */ + +#define map_page_into_agp(page) ( \ + xen_create_contiguous_region((unsigned long)page_address(page), 0, 32) \ + ?: set_pages_uc(page, 1)) +#define unmap_page_from_agp(page) ( \ + xen_destroy_contiguous_region((unsigned long)page_address(page), 0), \ + /* only a fallback: xen_destroy_contiguous_region uses PAGE_KERNEL */ \ + set_pages_wb(page, 1)) + +#define map_pages_into_agp(pages, nr) ({ \ + __typeof__(nr) n__; \ + int rc__ = 0; \ + for (n__ = 0; n__ < (nr) && !rc__; ++n__) \ + rc__ = xen_create_contiguous_region( \ + (unsigned long)page_address((pages)[n__]), 0, 32); \ + rc__ ?: set_pages_array_uc(pages, nr); \ +}) +#define unmap_pages_from_agp(pages, nr) ({ \ + __typeof__(nr) n__; \ + for (n__ = 0; n__ < nr; ++n__) \ + xen_destroy_contiguous_region( \ + (unsigned long)page_address((pages)[n__]), 0); \ + /* only a fallback: xen_destroy_contiguous_region uses PAGE_KERNEL */ \ + set_pages_array_wb(pages, nr); \ +}) + +/* + * Could use CLFLUSH here if the cpu supports it. But then it would + * need to be called for each cacheline of the whole page so it may + * not be worth it. Would need a page for it. + */ +#define flush_agp_cache() wbinvd() + +#define virt_to_gart virt_to_machine + +/* GATT allocation. Returns/accepts GATT kernel virtual address. 
*/ +#define alloc_gatt_pages(order) ({ \ + char *_t; dma_addr_t _d; \ + _t = dma_alloc_coherent(NULL,PAGE_SIZE<<(order),&_d,GFP_KERNEL); \ + _t; }) +#define free_gatt_pages(table, order) \ + dma_free_coherent(NULL,PAGE_SIZE<<(order),(table),virt_to_bus(table)) + +#endif /* _ASM_X86_AGP_H */ diff --git a/arch/x86/include/mach-xen/asm/cmpxchg.h b/arch/x86/include/mach-xen/asm/cmpxchg.h new file mode 100644 index 0000000..17fde1d --- /dev/null +++ b/arch/x86/include/mach-xen/asm/cmpxchg.h @@ -0,0 +1,11 @@ +#ifndef _ASM_X86_XEN_CMPXCHG_H +#define _ASM_X86_XEN_CMPXCHG_H + +#include_next +#ifdef CONFIG_X86_32 +# include "cmpxchg_32.h" +#else +# include "cmpxchg_64.h" +#endif + +#endif /* _ASM_X86_XEN_CMPXCHG_H */ diff --git a/arch/x86/include/mach-xen/asm/cmpxchg_32.h b/arch/x86/include/mach-xen/asm/cmpxchg_32.h new file mode 100644 index 0000000..9effb00 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/cmpxchg_32.h @@ -0,0 +1,24 @@ +#ifndef _ASM_X86_XEN_CMPXCHG_32_H +#define _ASM_X86_XEN_CMPXCHG_32_H + +static inline u64 get_64bit(const volatile u64 *ptr) +{ + u64 res; + __asm__("movl %%ebx,%%eax\n" + "movl %%ecx,%%edx\n" + LOCK_PREFIX "cmpxchg8b %1" + : "=&A" (res) : "m" (*ptr)); + return res; +} + +static inline u64 get_64bit_local(const volatile u64 *ptr) +{ + u64 res; + __asm__("movl %%ebx,%%eax\n" + "movl %%ecx,%%edx\n" + "cmpxchg8b %1" + : "=&A" (res) : "m" (*ptr)); + return res; +} + +#endif /* _ASM_X86_XEN_CMPXCHG_32_H */ diff --git a/arch/x86/include/mach-xen/asm/cmpxchg_64.h b/arch/x86/include/mach-xen/asm/cmpxchg_64.h new file mode 100644 index 0000000..092b27b --- /dev/null +++ b/arch/x86/include/mach-xen/asm/cmpxchg_64.h @@ -0,0 +1,11 @@ +#ifndef _ASM_X86_XEN_CMPXCHG_64_H +#define _ASM_X86_XEN_CMPXCHG_64_H + +static inline u64 get_64bit(const volatile u64 *ptr) +{ + return *ptr; +} + +#define get_64bit_local get_64bit + +#endif /* _ASM_X86_XEN_CMPXCHG_64_H */ diff --git a/arch/x86/include/mach-xen/asm/desc.h b/arch/x86/include/mach-xen/asm/desc.h new file mode 100644 index 0000000..14862a0 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/desc.h @@ -0,0 +1,433 @@ +#ifndef _ASM_X86_DESC_H +#define _ASM_X86_DESC_H + +#include +#include +#include + +#include + +static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *info) +{ + desc->limit0 = info->limit & 0x0ffff; + + desc->base0 = (info->base_addr & 0x0000ffff); + desc->base1 = (info->base_addr & 0x00ff0000) >> 16; + + desc->type = (info->read_exec_only ^ 1) << 1; + desc->type |= info->contents << 2; + + desc->s = 1; + desc->dpl = 0x3; + desc->p = info->seg_not_present ^ 1; + desc->limit = (info->limit & 0xf0000) >> 16; + desc->avl = info->useable; + desc->d = info->seg_32bit; + desc->g = info->limit_in_pages; + + desc->base2 = (info->base_addr & 0xff000000) >> 24; + /* + * Don't allow setting of the lm bit. It would confuse + * user_64bit_mode and would get overridden by sysret anyway. 
+ */ + desc->l = 0; +} + +#ifndef CONFIG_X86_NO_IDT +extern struct desc_ptr idt_descr; +extern gate_desc idt_table[]; +extern struct desc_ptr nmi_idt_descr; +extern gate_desc nmi_idt_table[]; +#endif + +struct gdt_page { + struct desc_struct gdt[GDT_ENTRIES]; +} __attribute__((aligned(PAGE_SIZE))); + +DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page); + +static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) +{ + return per_cpu(gdt_page, cpu).gdt; +} + +#ifdef CONFIG_X86_64 + +static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func, + unsigned dpl, unsigned ist, unsigned seg) +{ + gate->offset_low = PTR_LOW(func); + gate->segment = __KERNEL_CS; + gate->ist = ist; + gate->p = 1; + gate->dpl = dpl; + gate->zero0 = 0; + gate->zero1 = 0; + gate->type = type; + gate->offset_middle = PTR_MIDDLE(func); + gate->offset_high = PTR_HIGH(func); +} + +#else +static inline void pack_gate(gate_desc *gate, unsigned char type, + unsigned long base, unsigned dpl, unsigned flags, + unsigned short seg) +{ + gate->a = (seg << 16) | (base & 0xffff); + gate->b = (base & 0xffff0000) | (((0x80 | type | (dpl << 5)) & 0xff) << 8); +} + +#endif + +static inline int desc_empty(const void *ptr) +{ + const u32 *desc = ptr; + + return !(desc[0] | desc[1]); +} + +#ifndef CONFIG_XEN +#define load_TR_desc() native_load_tr_desc() +#define load_gdt(dtr) native_load_gdt(dtr) +#define load_idt(dtr) native_load_idt(dtr) +#define load_tr(tr) asm volatile("ltr %0"::"m" (tr)) +#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt)) + +#define store_gdt(dtr) native_store_gdt(dtr) +#define store_idt(dtr) native_store_idt(dtr) +#define store_tr(tr) (tr = native_store_tr()) + +#define load_TLS(t, cpu) native_load_tls(t, cpu) +#define set_ldt native_set_ldt + +#define write_ldt_entry(dt, entry, desc) native_write_ldt_entry(dt, entry, desc) +#define write_gdt_entry(dt, entry, desc, type) native_write_gdt_entry(dt, entry, desc, type) +#define write_idt_entry(dt, entry, g) native_write_idt_entry(dt, entry, g) + +static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries) +{ +} + +static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries) +{ +} + +#define store_ldt(ldt) asm("sldt %0" : "=m"(ldt)) + +static inline void native_write_idt_entry(gate_desc *idt, int entry, const gate_desc *gate) +{ + memcpy(&idt[entry], gate, sizeof(*gate)); +} + +static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry, const void *desc) +{ + memcpy(&ldt[entry], desc, 8); +} + +static inline void +native_write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc, int type) +{ + unsigned int size; + + switch (type) { + case DESC_TSS: size = sizeof(tss_desc); break; + case DESC_LDT: size = sizeof(ldt_desc); break; + default: size = sizeof(*gdt); break; + } + + memcpy(&gdt[entry], desc, size); +} +#endif + +static inline void pack_descriptor(struct desc_struct *desc, unsigned long base, + unsigned long limit, unsigned char type, + unsigned char flags) +{ + desc->a = ((base & 0xffff) << 16) | (limit & 0xffff); + desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) | + (limit & 0x000f0000) | ((type & 0xff) << 8) | + ((flags & 0xf) << 20); + desc->p = 1; +} + + +#ifndef CONFIG_XEN +static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned type, unsigned size) +{ +#ifdef CONFIG_X86_64 + struct ldttss_desc64 *desc = d; + + memset(desc, 0, sizeof(*desc)); + + desc->limit0 = size & 0xFFFF; + desc->base0 = PTR_LOW(addr); + 
desc->base1 = PTR_MIDDLE(addr) & 0xFF; + desc->type = type; + desc->p = 1; + desc->limit1 = (size >> 16) & 0xF; + desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF; + desc->base3 = PTR_HIGH(addr); +#else + pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0); +#endif +} + +static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr) +{ + struct desc_struct *d = get_cpu_gdt_table(cpu); + tss_desc tss; + + /* + * sizeof(unsigned long) coming from an extra "long" at the end + * of the iobitmap. See tss_struct definition in processor.h + * + * -1? seg base+limit should be pointing to the address of the + * last valid byte + */ + set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS, + IO_BITMAP_OFFSET + IO_BITMAP_BYTES + + sizeof(unsigned long) - 1); + write_gdt_entry(d, entry, &tss, DESC_TSS); +} + +#define set_tss_desc(cpu, addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr) + +static inline void native_set_ldt(const void *addr, unsigned int entries) +{ + if (likely(entries == 0)) + asm volatile("lldt %w0"::"q" (0)); + else { + unsigned cpu = smp_processor_id(); + ldt_desc ldt; + + set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT, + entries * LDT_ENTRY_SIZE - 1); + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, + &ldt, DESC_LDT); + asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8)); + } +} + +static inline void native_load_tr_desc(void) +{ + asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); +} + +static inline void native_load_gdt(const struct desc_ptr *dtr) +{ + asm volatile("lgdt %0"::"m" (*dtr)); +} + +static inline void native_load_idt(const struct desc_ptr *dtr) +{ + asm volatile("lidt %0"::"m" (*dtr)); +} + +static inline void native_store_gdt(struct desc_ptr *dtr) +{ + asm volatile("sgdt %0":"=m" (*dtr)); +} + +static inline void native_store_idt(struct desc_ptr *dtr) +{ + asm volatile("sidt %0":"=m" (*dtr)); +} + +static inline unsigned long native_store_tr(void) +{ + unsigned long tr; + + asm volatile("str %0":"=r" (tr)); + + return tr; +} + +static inline void native_load_tls(struct thread_struct *t, unsigned int cpu) +{ + struct desc_struct *gdt = get_cpu_gdt_table(cpu); + unsigned int i; + + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) + gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]; +} +#else +#include + +#define load_TLS(t, cpu) xen_load_tls(t, cpu) +#define set_ldt xen_set_ldt + +extern int write_ldt_entry(struct desc_struct *ldt, int entry, + const void *desc); +extern int write_gdt_entry(struct desc_struct *gdt, int entry, + const void *desc, int type); + +static inline void xen_load_tls(struct thread_struct *t, unsigned int cpu) +{ + unsigned int i; + struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN; + + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) + if (HYPERVISOR_update_descriptor( + arbitrary_virt_to_machine(&gdt[i]), + *(u64 *)&t->tls_array[i])) + BUG(); +} +#endif + +#define _LDT_empty(info) \ + ((info)->base_addr == 0 && \ + (info)->limit == 0 && \ + (info)->contents == 0 && \ + (info)->read_exec_only == 1 && \ + (info)->seg_32bit == 0 && \ + (info)->limit_in_pages == 0 && \ + (info)->seg_not_present == 1 && \ + (info)->useable == 0) + +#ifdef CONFIG_X86_64 +#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0)) +#else +#define LDT_empty(info) (_LDT_empty(info)) +#endif + +static inline void clear_LDT(void) +{ + set_ldt(NULL, 0); +} + +/* + * load one particular LDT into the current CPU + */ +static inline void load_LDT_nolock(mm_context_t *pc) +{ + set_ldt(pc->ldt, pc->size); +} + +static inline void 
load_LDT(mm_context_t *pc) +{ + preempt_disable(); + load_LDT_nolock(pc); + preempt_enable(); +} + +static inline unsigned long get_desc_base(const struct desc_struct *desc) +{ + return (unsigned)(desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24)); +} + +static inline void set_desc_base(struct desc_struct *desc, unsigned long base) +{ + desc->base0 = base & 0xffff; + desc->base1 = (base >> 16) & 0xff; + desc->base2 = (base >> 24) & 0xff; +} + +static inline unsigned long get_desc_limit(const struct desc_struct *desc) +{ + return desc->limit0 | (desc->limit << 16); +} + +static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) +{ + desc->limit0 = limit & 0xffff; + desc->limit = (limit >> 16) & 0xf; +} + +#ifndef CONFIG_X86_NO_IDT +#ifdef CONFIG_X86_64 +static inline void set_nmi_gate(int gate, void *addr) +{ + gate_desc s; + + pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS); + write_idt_entry(nmi_idt_table, gate, &s); +} +#endif + +static inline void _set_gate(int gate, unsigned type, void *addr, + unsigned dpl, unsigned ist, unsigned seg) +{ + gate_desc s; + + pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg); + /* + * does not need to be atomic because it is only done once at + * setup time + */ + write_idt_entry(idt_table, gate, &s); +} + +/* + * This needs to use 'idt_table' rather than 'idt', and + * thus use the _nonmapped_ version of the IDT, as the + * Pentium F0 0F bugfix can have resulted in the mapped + * IDT being write-protected. + */ +static inline void set_intr_gate(unsigned int n, void *addr) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS); +} + +extern int first_system_vector; +/* used_vectors is BITMAP for irq is not managed by percpu vector_irq */ +extern unsigned long used_vectors[]; + +static inline void alloc_system_vector(int vector) +{ + if (!test_bit(vector, used_vectors)) { + set_bit(vector, used_vectors); + if (first_system_vector > vector) + first_system_vector = vector; + } else { + BUG(); + } +} + +static inline void alloc_intr_gate(unsigned int n, void *addr) +{ + alloc_system_vector(n); + set_intr_gate(n, addr); +} + +/* + * This routine sets up an interrupt gate at directory privilege level 3. 
+ */ +static inline void set_system_intr_gate(unsigned int n, void *addr) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS); +} + +static inline void set_system_trap_gate(unsigned int n, void *addr) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS); +} + +static inline void set_trap_gate(unsigned int n, void *addr) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS); +} + +static inline void set_task_gate(unsigned int n, unsigned int gdt_entry) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3)); +} + +static inline void set_intr_gate_ist(int n, void *addr, unsigned ist) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS); +} + +static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); +} +#endif + +#endif /* _ASM_X86_DESC_H */ diff --git a/arch/x86/include/mach-xen/asm/dma-mapping.h b/arch/x86/include/mach-xen/asm/dma-mapping.h new file mode 100644 index 0000000..5054274 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/dma-mapping.h @@ -0,0 +1,25 @@ +#ifndef _ASM_X86_DMA_MAPPING_H_ + +#define phys_to_dma _phys_to_dma_ +#define dma_to_phys _dma_to_phys_ + +#include_next + +#undef phys_to_dma +#undef dma_to_phys + +static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) +{ + return phys_to_machine(paddr); +} + +static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) +{ + return machine_to_phys(daddr); +} + +void dma_generic_free_coherent(struct device *, size_t, void *, dma_addr_t); + +extern int range_straddles_page_boundary(paddr_t p, size_t size); + +#endif /* _ASM_X86_DMA_MAPPING_H_ */ diff --git a/arch/x86/include/mach-xen/asm/fixmap.h b/arch/x86/include/mach-xen/asm/fixmap.h new file mode 100644 index 0000000..dccdd97 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/fixmap.h @@ -0,0 +1,240 @@ +/* + * fixmap.h: compile-time virtual memory allocation + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 1998 Ingo Molnar + * + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + * x86_32 and x86_64 integration by Gustavo F. Padovan, February 2009 + */ + +#ifndef _ASM_X86_FIXMAP_H +#define _ASM_X86_FIXMAP_H + +#ifndef __ASSEMBLY__ +#include +#include +#include +#ifdef CONFIG_X86_32 +#include +#include +#else +#include +#endif + +/* + * We can't declare FIXADDR_TOP as variable for x86_64 because vsyscall + * uses fixmaps that relies on FIXADDR_TOP for proper address calculation. + * Because of this, FIXADDR_TOP x86 integration was left as later work. + */ +#ifdef CONFIG_X86_32 +/* used by vmalloc.c, vsyscall.lds.S. + * + * Leave one empty page between vmalloc'ed areas and + * the start of the fixmap. + */ +extern unsigned long __FIXADDR_TOP; +#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP) + +#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO) +#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1) +#else +#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE) + +/* Only covers 32bit vsyscalls currently. Need another set for 64bit. 
*/ +#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL) +#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE) +#endif + + +/* + * Here we define all the compile-time 'special' virtual + * addresses. The point is to have a constant address at + * compile time, but to set the physical address only + * in the boot process. + * for x86_32: We allocate these special addresses + * from the end of virtual memory (0xfffff000) backwards. + * Also this lets us do fail-safe vmalloc(), we + * can guarantee that these special addresses and + * vmalloc()-ed addresses never overlap. + * + * These 'compile-time allocated' memory buffers are + * fixed-size 4k pages (or larger if used with an increment + * higher than 1). Use set_fixmap(idx,phys) to associate + * physical memory with fixmap indices. + * + * TLB entries of such buffers will not be flushed across + * task switches. + */ +enum fixed_addresses { +#ifdef CONFIG_X86_32 + FIX_HOLE, + FIX_VDSO, +#else + VSYSCALL_LAST_PAGE, + VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE + + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1, + VVAR_PAGE, + VSYSCALL_HPET, +#endif + FIX_DBGP_BASE, + FIX_EARLYCON_MEM_BASE, +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT + FIX_OHCI1394_BASE, +#endif +#ifndef CONFIG_XEN +#ifdef CONFIG_X86_LOCAL_APIC + FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ +#endif +#ifdef CONFIG_X86_IO_APIC + FIX_IO_APIC_BASE_0, + FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, +#endif +#else + FIX_SHARED_INFO, +#define NR_FIX_ISAMAPS 256 + FIX_ISAMAP_END, + FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1, +#endif +#ifdef CONFIG_X86_VISWS_APIC + FIX_CO_CPU, /* Cobalt timer */ + FIX_CO_APIC, /* Cobalt APIC Redirection Table */ + FIX_LI_PCIA, /* Lithium PCI Bridge A */ + FIX_LI_PCIB, /* Lithium PCI Bridge B */ +#endif +#ifdef CONFIG_X86_F00F_BUG + FIX_F00F_IDT, /* Virtual mapping for IDT */ +#endif +#ifdef CONFIG_X86_CYCLONE_TIMER + FIX_CYCLONE_TIMER, /*cyclone timer register*/ +#endif +#ifdef CONFIG_X86_32 + FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ + FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, +#ifdef CONFIG_PCI_MMCONFIG + FIX_PCIE_MCFG, +#endif +#endif +#ifdef CONFIG_PARAVIRT + FIX_PARAVIRT_BOOTMAP, +#endif + FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */ + FIX_TEXT_POKE0, /* first page is last, because allocation is backward */ +#ifdef CONFIG_X86_INTEL_MID + FIX_LNW_VRTC, +#endif + __end_of_permanent_fixed_addresses, + + /* + * 256 temporary boot-time mappings, used by early_ioremap(), + * before ioremap() is functional. + * + * If necessary we round it up to the next 256 pages boundary so + * that we can have a single pgd entry and a single pte table: + */ +#define NR_FIX_BTMAPS 64 +#define FIX_BTMAPS_SLOTS 4 +#define TOTAL_FIX_BTMAPS (NR_FIX_BTMAPS * FIX_BTMAPS_SLOTS) + FIX_BTMAP_END = + (__end_of_permanent_fixed_addresses ^ + (__end_of_permanent_fixed_addresses + TOTAL_FIX_BTMAPS - 1)) & + -PTRS_PER_PTE + ? 
__end_of_permanent_fixed_addresses + TOTAL_FIX_BTMAPS - + (__end_of_permanent_fixed_addresses & (TOTAL_FIX_BTMAPS - 1)) + : __end_of_permanent_fixed_addresses, + FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1, +#ifdef CONFIG_X86_32 + FIX_WP_TEST, +#endif +#ifdef CONFIG_INTEL_TXT + FIX_TBOOT_BASE, +#endif + __end_of_fixed_addresses +}; + + +extern void reserve_top_address(unsigned long reserve); + +#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) +#define FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) +#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) +#define FIXADDR_BOOT_START (FIXADDR_TOP - FIXADDR_BOOT_SIZE) + +extern int fixmaps_set; + +extern pte_t *kmap_pte; +extern pgprot_t kmap_prot; +extern pte_t *pkmap_page_table; + +void xen_set_fixmap(enum fixed_addresses, phys_addr_t, pgprot_t); + +static inline void __set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t flags) +{ + xen_set_fixmap(idx, phys, flags); +} + +#define set_fixmap(idx, phys) \ + __set_fixmap(idx, phys, PAGE_KERNEL) + +/* + * Some hardware wants to get fixmapped without caching. + */ +#define set_fixmap_nocache(idx, phys) \ + __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE) + +#define clear_fixmap(idx) \ + __set_fixmap(idx, 0, __pgprot(0)) + +#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) +#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT) + +extern void __this_fixmap_does_not_exist(void); + +/* + * 'index to address' translation. If anyone tries to use the idx + * directly without translation, we catch the bug with a NULL-deference + * kernel oops. Illegal ranges of incoming indices are caught too. + */ +static __always_inline unsigned long fix_to_virt(const unsigned int idx) +{ + /* + * this branch gets completely eliminated after inlining, + * except when someone tries to use fixaddr indices in an + * illegal way. (such as mixing up address types or using + * out-of-range indices). + * + * If it doesn't get removed, the linker will complain + * loudly with a reasonably clear error message.. + */ + if (idx >= __end_of_fixed_addresses) + __this_fixmap_does_not_exist(); + + return __fix_to_virt(idx); +} + +static inline unsigned long virt_to_fix(const unsigned long vaddr) +{ + BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START); + return __virt_to_fix(vaddr); +} + +/* Return an pointer with offset calculated */ +static __always_inline unsigned long +__set_fixmap_offset(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) +{ + __set_fixmap(idx, phys, flags); + return fix_to_virt(idx) + (phys & (PAGE_SIZE - 1)); +} + +#define set_fixmap_offset(idx, phys) \ + __set_fixmap_offset(idx, phys, PAGE_KERNEL) + +#define set_fixmap_offset_nocache(idx, phys) \ + __set_fixmap_offset(idx, phys, PAGE_KERNEL_NOCACHE) + +#endif /* !__ASSEMBLY__ */ +#endif /* _ASM_X86_FIXMAP_H */ diff --git a/arch/x86/include/mach-xen/asm/gnttab_dma.h b/arch/x86/include/mach-xen/asm/gnttab_dma.h new file mode 100644 index 0000000..fd7197c --- /dev/null +++ b/arch/x86/include/mach-xen/asm/gnttab_dma.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2007 Herbert Xu + * Copyright (c) 2007 Isaku Yamahata + * VA Linux Systems Japan K.K. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _ASM_I386_GNTTAB_DMA_H +#define _ASM_I386_GNTTAB_DMA_H + +static inline int gnttab_dma_local_pfn(struct page *page) +{ + /* Has it become a local MFN? */ + return pfn_valid(mfn_to_local_pfn(pfn_to_mfn(page_to_pfn(page)))); +} + +static inline maddr_t gnttab_dma_map_page(struct page *page) +{ + __gnttab_dma_map_page(page); + return ((maddr_t)pfn_to_mfn(page_to_pfn(page)) << PAGE_SHIFT); +} + +static inline void gnttab_dma_unmap_page(maddr_t maddr) +{ + __gnttab_dma_unmap_page(virt_to_page(bus_to_virt(maddr))); +} + +#endif /* _ASM_I386_GNTTAB_DMA_H */ diff --git a/arch/x86/include/mach-xen/asm/highmem.h b/arch/x86/include/mach-xen/asm/highmem.h new file mode 100644 index 0000000..1243d04 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/highmem.h @@ -0,0 +1,98 @@ +/* + * highmem.h: virtual kernel memory mappings for high memory + * + * Used in CONFIG_HIGHMEM systems for memory pages which + * are not addressable by direct kernel virtual addresses. + * + * Copyright (C) 1999 Gerhard Wichert, Siemens AG + * Gerhard.Wichert@pdb.siemens.de + * + * + * Redesigned the x86 32-bit VM architecture to deal with + * up to 16 Terabyte physical memory. With current x86 CPUs + * we now support up to 64 Gigabytes physical RAM. + * + * Copyright (C) 1999 Ingo Molnar + */ + +#ifndef _ASM_X86_HIGHMEM_H +#define _ASM_X86_HIGHMEM_H + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include + +/* declarations for highmem.c */ +extern unsigned long highstart_pfn, highend_pfn; + +/* + * Right now we initialize only a single pte table. It can be extended + * easily, subsequent pte tables have to be allocated in one physical + * chunk of RAM. + */ +/* + * Ordering is: + * + * FIXADDR_TOP + * fixed_addresses + * FIXADDR_START + * temp fixed addresses + * FIXADDR_BOOT_START + * Persistent kmap area + * PKMAP_BASE + * VMALLOC_END + * Vmalloc area + * VMALLOC_START + * high_memory + */ +#define LAST_PKMAP_MASK (LAST_PKMAP-1) +#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) +#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) + +extern void *kmap_high(struct page *page); +extern void kunmap_high(struct page *page); + +void *kmap(struct page *page); +void kunmap(struct page *page); + +void *kmap_atomic_prot(struct page *page, pgprot_t prot); +void *__kmap_atomic(struct page *page); +void __kunmap_atomic(void *kvaddr); +void *kmap_atomic_pfn(unsigned long pfn); +void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); +struct page *kmap_atomic_to_page(void *ptr); + +#define kmap_atomic_pte(page) \ + kmap_atomic_prot(page, \ + PagePinned(page) ? 
PAGE_KERNEL_RO : kmap_prot) + +#define flush_cache_kmaps() do { } while (0) + +extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn, + unsigned long end_pfn); + +void clear_highpage(struct page *); +static inline void clear_user_highpage(struct page *page, unsigned long vaddr) +{ + clear_highpage(page); +} +#define __HAVE_ARCH_CLEAR_HIGHPAGE +#define clear_user_highpage clear_user_highpage +#define __HAVE_ARCH_CLEAR_USER_HIGHPAGE + +void copy_highpage(struct page *to, struct page *from); +static inline void copy_user_highpage(struct page *to, struct page *from, + unsigned long vaddr, struct vm_area_struct *vma) +{ + copy_highpage(to, from); +} +#define __HAVE_ARCH_COPY_HIGHPAGE +#define __HAVE_ARCH_COPY_USER_HIGHPAGE + +#endif /* __KERNEL__ */ + +#endif /* _ASM_X86_HIGHMEM_H */ diff --git a/arch/x86/include/mach-xen/asm/hypercall.h b/arch/x86/include/mach-xen/asm/hypercall.h new file mode 100644 index 0000000..573ce8d --- /dev/null +++ b/arch/x86/include/mach-xen/asm/hypercall.h @@ -0,0 +1,439 @@ +/****************************************************************************** + * hypercall.h + * + * Linux-specific hypervisor handling. + * + * Copyright (c) 2002-2004, K A Fraser + * + * 64-bit updates: + * Benjamin Liu + * Jun Nakajima + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef __HYPERCALL_H__ +#define __HYPERCALL_H__ + +#ifndef __HYPERVISOR_H__ +# error "please don't include this file directly" +#endif + +#ifdef CONFIG_XEN_PRIVILEGED_GUEST +# include +# include +#endif +#if CONFIG_XEN_COMPAT <= 0x030002 +# include /* memcpy() */ +# include +# include +#endif + +#ifdef CONFIG_XEN +#define HYPERCALL_ASM_OPERAND "%c" +#define HYPERCALL_LOCATION(op) (hypercall_page + (op) * 32) +#define HYPERCALL_C_OPERAND(name) "i" (HYPERCALL_LOCATION(__HYPERVISOR_##name)) +#else +#define HYPERCALL_ASM_OPERAND "*%" +#define HYPERCALL_LOCATION(op) (hypercall_stubs + (op) * 32) +#define HYPERCALL_C_OPERAND(name) "g" (HYPERCALL_LOCATION(__HYPERVISOR_##name)) +#endif + +#define HYPERCALL_ARG(arg, n) \ + register typeof((arg)+0) __arg##n asm(HYPERCALL_arg##n) = (arg) + +#define _hypercall0(type, name) \ +({ \ + type __res; \ + asm volatile ( \ + "call " HYPERCALL_ASM_OPERAND "1" \ + : "=a" (__res) \ + : HYPERCALL_C_OPERAND(name) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall1(type, name, arg) \ +({ \ + type __res; \ + HYPERCALL_ARG(arg, 1); \ + asm volatile ( \ + "call " HYPERCALL_ASM_OPERAND "2" \ + : "=a" (__res), "+r" (__arg1) \ + : HYPERCALL_C_OPERAND(name) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall2(type, name, a1, a2) \ +({ \ + type __res; \ + HYPERCALL_ARG(a1, 1); \ + HYPERCALL_ARG(a2, 2); \ + asm volatile ( \ + "call " HYPERCALL_ASM_OPERAND "3" \ + : "=a" (__res), "+r" (__arg1), "+r" (__arg2) \ + : HYPERCALL_C_OPERAND(name) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall3(type, name, a1, a2, a3) \ +({ \ + type __res; \ + HYPERCALL_ARG(a1, 1); \ + HYPERCALL_ARG(a2, 2); \ + HYPERCALL_ARG(a3, 3); \ + asm volatile ( \ + "call " HYPERCALL_ASM_OPERAND "4" \ + : "=a" (__res), "+r" (__arg1), \ + "+r" (__arg2), "+r" (__arg3) \ + : HYPERCALL_C_OPERAND(name) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall4(type, name, a1, a2, a3, a4) \ +({ \ + type __res; \ + HYPERCALL_ARG(a1, 1); \ + HYPERCALL_ARG(a2, 2); \ + HYPERCALL_ARG(a3, 3); \ + HYPERCALL_ARG(a4, 4); \ + asm volatile ( \ + "call " HYPERCALL_ASM_OPERAND "5" \ + : "=a" (__res), "+r" (__arg1), "+r" (__arg2), \ + "+r" (__arg3), "+r" (__arg4) \ + : HYPERCALL_C_OPERAND(name) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall5(type, name, a1, a2, a3, a4, a5) \ +({ \ + type __res; \ + HYPERCALL_ARG(a1, 1); \ + HYPERCALL_ARG(a2, 2); \ + HYPERCALL_ARG(a3, 3); \ + HYPERCALL_ARG(a4, 4); \ + HYPERCALL_ARG(a5, 5); \ + asm volatile ( \ + "call " HYPERCALL_ASM_OPERAND "6" \ + : "=a" (__res), "+r" (__arg1), "+r" (__arg2), \ + "+r" (__arg3), "+r" (__arg4), "+r" (__arg5) \ + : HYPERCALL_C_OPERAND(name) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall(type, op, a1, a2, a3, a4, a5) \ +({ \ + type __res; \ + HYPERCALL_ARG(a1, 1); \ + HYPERCALL_ARG(a2, 2); \ + HYPERCALL_ARG(a3, 3); \ + HYPERCALL_ARG(a4, 4); \ + HYPERCALL_ARG(a5, 5); \ + asm volatile ( \ + "call *%6" \ + : "=a" (__res), "+r" (__arg1), "+r" (__arg2), \ + "+r" (__arg3), "+r" (__arg4), "+r" (__arg5) \ + : "g" (HYPERCALL_LOCATION(op)) \ + : "memory" ); \ + __res; \ +}) + +#ifdef CONFIG_X86_32 +# include "hypercall_32.h" +#else +# include "hypercall_64.h" +#endif + +static inline int __must_check +HYPERVISOR_set_trap_table( + const trap_info_t *table) +{ + return _hypercall1(int, set_trap_table, table); +} + +static inline int __must_check +HYPERVISOR_mmu_update( + mmu_update_t *req, unsigned int count, unsigned int *success_count, + domid_t domid) +{ + if (arch_use_lazy_mmu_mode()) + return xen_multi_mmu_update(req, count, 
success_count, domid); + return _hypercall4(int, mmu_update, req, count, success_count, domid); +} + +static inline int __must_check +HYPERVISOR_mmuext_op( + struct mmuext_op *op, unsigned int count, unsigned int *success_count, + domid_t domid) +{ + if (arch_use_lazy_mmu_mode()) + return xen_multi_mmuext_op(op, count, success_count, domid); + return _hypercall4(int, mmuext_op, op, count, success_count, domid); +} + +static inline int __must_check +HYPERVISOR_set_gdt( + unsigned long *frame_list, unsigned int entries) +{ + return _hypercall2(int, set_gdt, frame_list, entries); +} + +static inline int __must_check +HYPERVISOR_stack_switch( + unsigned long ss, unsigned long esp) +{ + return _hypercall2(int, stack_switch, ss, esp); +} + +static inline int +HYPERVISOR_fpu_taskswitch( + int set) +{ + return _hypercall1(int, fpu_taskswitch, set); +} + +#if CONFIG_XEN_COMPAT <= 0x030002 +static inline int __must_check +HYPERVISOR_sched_op_compat( + int cmd, unsigned long arg) +{ + return _hypercall2(int, sched_op_compat, cmd, arg); +} +#endif + +static inline int __must_check +HYPERVISOR_sched_op( + int cmd, void *arg) +{ + return _hypercall2(int, sched_op, cmd, arg); +} + +#ifdef CONFIG_XEN_PRIVILEGED_GUEST +static inline int __must_check +HYPERVISOR_platform_op( + struct xen_platform_op *platform_op) +{ + platform_op->interface_version = XENPF_INTERFACE_VERSION; + return _hypercall1(int, platform_op, platform_op); +} + +static inline int __must_check +HYPERVISOR_mca( + struct xen_mc *mc_op) +{ + mc_op->interface_version = XEN_MCA_INTERFACE_VERSION; + return _hypercall1(int, mca, mc_op); +} +#endif + +static inline int __must_check +HYPERVISOR_set_debugreg( + unsigned int reg, unsigned long value) +{ + return _hypercall2(int, set_debugreg, reg, value); +} + +static inline unsigned long __must_check +HYPERVISOR_get_debugreg( + unsigned int reg) +{ + return _hypercall1(unsigned long, get_debugreg, reg); +} + +static inline int __must_check +HYPERVISOR_memory_op( + unsigned int cmd, void *arg) +{ + if (arch_use_lazy_mmu_mode()) + xen_multicall_flush(); + return _hypercall2(int, memory_op, cmd, arg); +} + +static inline int __must_check +HYPERVISOR_multicall( + multicall_entry_t *call_list, unsigned int nr_calls) +{ + return _hypercall2(int, multicall, call_list, nr_calls); +} + +static inline int __must_check +HYPERVISOR_event_channel_op( + int cmd, void *arg) +{ + int rc = _hypercall2(int, event_channel_op, cmd, arg); + +#if CONFIG_XEN_COMPAT <= 0x030002 + if (unlikely(rc == -ENOSYS)) { + struct evtchn_op op; + op.cmd = cmd; + memcpy(&op.u, arg, sizeof(op.u)); + rc = _hypercall1(int, event_channel_op_compat, &op); + memcpy(arg, &op.u, sizeof(op.u)); + } +#endif + + return rc; +} + +static inline int __must_check +HYPERVISOR_xen_version( + int cmd, void *arg) +{ + return _hypercall2(int, xen_version, cmd, arg); +} + +static inline int __must_check +HYPERVISOR_console_io( + int cmd, unsigned int count, char *str) +{ + return _hypercall3(int, console_io, cmd, count, str); +} + +static inline int __must_check +HYPERVISOR_physdev_op( + int cmd, void *arg) +{ + int rc = _hypercall2(int, physdev_op, cmd, arg); + +#if CONFIG_XEN_COMPAT <= 0x030002 + if (unlikely(rc == -ENOSYS)) { + struct physdev_op op; + op.cmd = cmd; + memcpy(&op.u, arg, sizeof(op.u)); + rc = _hypercall1(int, physdev_op_compat, &op); + memcpy(arg, &op.u, sizeof(op.u)); + } +#endif + + return rc; +} + +static inline int __must_check +HYPERVISOR_grant_table_op( + unsigned int cmd, void *uop, unsigned int count) +{ + bool fixup = false; + 
int rc; + + if (arch_use_lazy_mmu_mode()) + xen_multicall_flush(); +#ifdef GNTTABOP_map_grant_ref + if (cmd == GNTTABOP_map_grant_ref) +#endif + fixup = gnttab_pre_map_adjust(cmd, uop, count); + rc = _hypercall3(int, grant_table_op, cmd, uop, count); + if (rc == 0 && fixup) + rc = gnttab_post_map_adjust(uop, count); + return rc; +} + +static inline int __must_check +HYPERVISOR_vm_assist( + unsigned int cmd, unsigned int type) +{ + return _hypercall2(int, vm_assist, cmd, type); +} + +static inline int __must_check +HYPERVISOR_vcpu_op( + int cmd, unsigned int vcpuid, void *extra_args) +{ + return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args); +} + +static inline int __must_check +HYPERVISOR_suspend( + unsigned long srec) +{ + struct sched_shutdown sched_shutdown = { + .reason = SHUTDOWN_suspend + }; + + int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown, + &sched_shutdown, srec); + +#if CONFIG_XEN_COMPAT <= 0x030002 + if (rc == -ENOSYS) + rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown, + SHUTDOWN_suspend, srec); +#endif + + return rc; +} + +#if CONFIG_XEN_COMPAT <= 0x030002 +static inline int +HYPERVISOR_nmi_op( + unsigned long op, void *arg) +{ + return _hypercall2(int, nmi_op, op, arg); +} +#endif + +#ifndef CONFIG_XEN +static inline unsigned long __must_check +HYPERVISOR_hvm_op( + int op, void *arg) +{ + return _hypercall2(unsigned long, hvm_op, op, arg); +} +#endif + +static inline int __must_check +HYPERVISOR_callback_op( + int cmd, const void *arg) +{ + return _hypercall2(int, callback_op, cmd, arg); +} + +static inline int __must_check +HYPERVISOR_xenoprof_op( + int op, void *arg) +{ + return _hypercall2(int, xenoprof_op, op, arg); +} + +static inline int __must_check +HYPERVISOR_kexec_op( + unsigned long op, void *args) +{ + return _hypercall2(int, kexec_op, op, args); +} + +struct tmem_op; + +static inline int __must_check +HYPERVISOR_tmem_op( + struct tmem_op *op) +{ + return _hypercall1(int, tmem_op, (void *)op); +} + +#endif /* __HYPERCALL_H__ */ diff --git a/arch/x86/include/mach-xen/asm/hypercall_32.h b/arch/x86/include/mach-xen/asm/hypercall_32.h new file mode 100644 index 0000000..3987b2e --- /dev/null +++ b/arch/x86/include/mach-xen/asm/hypercall_32.h @@ -0,0 +1,62 @@ +#define HYPERCALL_arg1 "ebx" +#define HYPERCALL_arg2 "ecx" +#define HYPERCALL_arg3 "edx" +#define HYPERCALL_arg4 "esi" +#define HYPERCALL_arg5 "edi" + +#if CONFIG_XEN_COMPAT <= 0x030002 +static inline int __must_check +HYPERVISOR_set_callbacks( + unsigned long event_selector, unsigned long event_address, + unsigned long failsafe_selector, unsigned long failsafe_address) +{ + return _hypercall4(int, set_callbacks, + event_selector, event_address, + failsafe_selector, failsafe_address); +} +#endif + +static inline long __must_check +HYPERVISOR_set_timer_op( + u64 timeout) +{ + return _hypercall2(long, set_timer_op, + (unsigned long)timeout, + (unsigned long)(timeout>>32)); +} + +static inline int __must_check +HYPERVISOR_update_descriptor( + u64 ma, u64 desc) +{ + return _hypercall4(int, update_descriptor, + (unsigned long)ma, (unsigned long)(ma>>32), + (unsigned long)desc, (unsigned long)(desc>>32)); +} + +static inline int __must_check +HYPERVISOR_update_va_mapping( + unsigned long va, pte_t new_val, unsigned long flags) +{ + unsigned long pte_hi = 0; + + if (arch_use_lazy_mmu_mode()) + return xen_multi_update_va_mapping(va, new_val, flags); +#ifdef CONFIG_X86_PAE + pte_hi = new_val.pte_high; +#endif + return _hypercall4(int, update_va_mapping, va, + new_val.pte_low, pte_hi, flags); +} + 
+static inline int __must_check +HYPERVISOR_update_va_mapping_otherdomain( + unsigned long va, pte_t new_val, unsigned long flags, domid_t domid) +{ + unsigned long pte_hi = 0; +#ifdef CONFIG_X86_PAE + pte_hi = new_val.pte_high; +#endif + return _hypercall5(int, update_va_mapping_otherdomain, va, + new_val.pte_low, pte_hi, flags, domid); +} diff --git a/arch/x86/include/mach-xen/asm/hypercall_64.h b/arch/x86/include/mach-xen/asm/hypercall_64.h new file mode 100644 index 0000000..97d9445 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/hypercall_64.h @@ -0,0 +1,54 @@ +#define HYPERCALL_arg1 "rdi" +#define HYPERCALL_arg2 "rsi" +#define HYPERCALL_arg3 "rdx" +#define HYPERCALL_arg4 "r10" +#define HYPERCALL_arg5 "r8" + +#if CONFIG_XEN_COMPAT <= 0x030002 +static inline int __must_check +HYPERVISOR_set_callbacks( + unsigned long event_address, unsigned long failsafe_address, + unsigned long syscall_address) +{ + return _hypercall3(int, set_callbacks, + event_address, failsafe_address, syscall_address); +} +#endif + +static inline long __must_check +HYPERVISOR_set_timer_op( + u64 timeout) +{ + return _hypercall1(long, set_timer_op, timeout); +} + +static inline int __must_check +HYPERVISOR_update_descriptor( + unsigned long ma, unsigned long word) +{ + return _hypercall2(int, update_descriptor, ma, word); +} + +static inline int __must_check +HYPERVISOR_update_va_mapping( + unsigned long va, pte_t new_val, unsigned long flags) +{ + if (arch_use_lazy_mmu_mode()) + return xen_multi_update_va_mapping(va, new_val, flags); + return _hypercall3(int, update_va_mapping, va, new_val.pte, flags); +} + +static inline int __must_check +HYPERVISOR_update_va_mapping_otherdomain( + unsigned long va, pte_t new_val, unsigned long flags, domid_t domid) +{ + return _hypercall4(int, update_va_mapping_otherdomain, va, + new_val.pte, flags, domid); +} + +static inline int __must_check +HYPERVISOR_set_segment_base( + int reg, unsigned long value) +{ + return _hypercall2(int, set_segment_base, reg, value); +} diff --git a/arch/x86/include/mach-xen/asm/hypervisor.h b/arch/x86/include/mach-xen/asm/hypervisor.h new file mode 100644 index 0000000..b15cced --- /dev/null +++ b/arch/x86/include/mach-xen/asm/hypervisor.h @@ -0,0 +1,392 @@ +/****************************************************************************** + * hypervisor.h + * + * Linux-specific hypervisor handling. + * + * Copyright (c) 2002-2004, K A Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __HYPERVISOR_H__ +#define __HYPERVISOR_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern shared_info_t *HYPERVISOR_shared_info; + +#ifdef CONFIG_XEN_VCPU_INFO_PLACEMENT +DECLARE_PER_CPU(struct vcpu_info, vcpu_info); +#define vcpu_info(cpu) (&per_cpu(vcpu_info, cpu)) +#define current_vcpu_info() (&__get_cpu_var(vcpu_info)) +#define vcpu_info_read(fld) percpu_read(vcpu_info.fld) +#define vcpu_info_write(fld, val) percpu_write(vcpu_info.fld, val) +#define vcpu_info_xchg(fld, val) percpu_xchg(vcpu_info.fld, val) +void setup_vcpu_info(unsigned int cpu); +void adjust_boot_vcpu_info(void); +#else +#define vcpu_info(cpu) (HYPERVISOR_shared_info->vcpu_info + (cpu)) +#ifdef CONFIG_SMP +#define current_vcpu_info() vcpu_info(smp_processor_id()) +#else +#define current_vcpu_info() vcpu_info(0) +#endif +#define vcpu_info_read(fld) (current_vcpu_info()->fld) +#define vcpu_info_write(fld, val) (current_vcpu_info()->fld = (val)) +static inline void setup_vcpu_info(unsigned int cpu) {} +#endif + +#ifdef CONFIG_X86_32 +extern unsigned long hypervisor_virt_start; +#endif + +/* arch/xen/i386/kernel/setup.c */ +extern start_info_t *xen_start_info; +#ifdef CONFIG_XEN_PRIVILEGED_GUEST +#define is_initial_xendomain() (xen_start_info->flags & SIF_INITDOMAIN) +#else +#define is_initial_xendomain() 0 +#endif + +#define init_hypervisor(c) ((void)(c)) +#define init_hypervisor_platform() init_hypervisor(&boot_cpu_data) + +DECLARE_PER_CPU(struct vcpu_runstate_info, runstate); +#define vcpu_running(cpu) (per_cpu(runstate.state, cpu) == RUNSTATE_running) + +/* arch/xen/kernel/evtchn.c */ +/* Force a proper event-channel callback from Xen. */ +void force_evtchn_callback(void); + +/* arch/xen/kernel/process.c */ +void xen_cpu_idle (void); + +/* arch/xen/i386/kernel/hypervisor.c */ +void do_hypervisor_callback(struct pt_regs *regs); + +/* arch/xen/i386/mm/hypervisor.c */ +/* + * NB. ptr values should be PHYSICAL, not MACHINE. 'vals' should be already + * be MACHINE addresses. + */ + +void xen_pt_switch(pgd_t *); +void xen_new_user_pt(pgd_t *); /* x86_64 only */ +void xen_load_gs(unsigned int selector); /* x86_64 only */ +void xen_tlb_flush(void); +void xen_invlpg(unsigned long ptr); + +void xen_l1_entry_update(pte_t *ptr, pte_t val); +void xen_l2_entry_update(pmd_t *ptr, pmd_t val); +void xen_l3_entry_update(pud_t *ptr, pud_t val); /* x86_64/PAE */ +void xen_l4_entry_update(pgd_t *ptr, pgd_t val); /* x86_64 only */ +void xen_pgd_pin(pgd_t *); +void xen_pgd_unpin(pgd_t *); + +void xen_init_pgd_pin(void); +#ifdef CONFIG_PM_SLEEP +void setup_pfn_to_mfn_frame_list(void *(*)(unsigned long, unsigned long, + unsigned long)); +#endif + +void xen_set_ldt(const void *ptr, unsigned int ents); + +#ifdef CONFIG_SMP +#include +void xen_tlb_flush_all(void); +void xen_invlpg_all(unsigned long ptr); +void xen_tlb_flush_mask(const cpumask_t *mask); +void xen_invlpg_mask(const cpumask_t *mask, unsigned long ptr); +#else +#define xen_tlb_flush_all xen_tlb_flush +#define xen_invlpg_all xen_invlpg +#endif + +/* Returns zero on success else negative errno. 
*/ +int xen_create_contiguous_region( + unsigned long vstart, unsigned int order, unsigned int address_bits); +void xen_destroy_contiguous_region( + unsigned long vstart, unsigned int order); +int early_create_contiguous_region(unsigned long pfn, unsigned int order, + unsigned int address_bits); + +struct page; + +int xen_limit_pages_to_max_mfn( + struct page *pages, unsigned int order, unsigned int address_bits); + +bool __cold hypervisor_oom(void); + +/* Turn jiffies into Xen system time. */ +u64 jiffies_to_st(unsigned long jiffies); + +#ifdef CONFIG_XEN_SCRUB_PAGES +void scrub_pages(void *, unsigned int); +#else +#define scrub_pages(_p,_n) ((void)0) +#endif + +#if defined(CONFIG_XEN) && !defined(MODULE) + +DECLARE_PER_CPU(bool, xen_lazy_mmu); + +void xen_multicall_flush(void); + +int __must_check xen_multi_update_va_mapping(unsigned long va, pte_t, + unsigned long flags); +int __must_check xen_multi_mmu_update(mmu_update_t *, unsigned int count, + unsigned int *success_count, domid_t); +int __must_check xen_multi_mmuext_op(struct mmuext_op *, unsigned int count, + unsigned int *success_count, domid_t); + +#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE +static inline void arch_enter_lazy_mmu_mode(void) +{ + percpu_write(xen_lazy_mmu, true); +} + +static inline void arch_leave_lazy_mmu_mode(void) +{ + percpu_write(xen_lazy_mmu, false); + xen_multicall_flush(); +} + +#define arch_use_lazy_mmu_mode() unlikely(percpu_read(xen_lazy_mmu)) + +#if 0 /* All uses are in places potentially called asynchronously, but + * asynchronous code should rather not make use of lazy mode at all. + * Therefore, all uses of this function get commented out, proper + * detection of asynchronous invocations is added whereever needed, + * and this function is disabled to catch any new (improper) uses. + */ +static inline void arch_flush_lazy_mmu_mode(void) +{ + if (arch_use_lazy_mmu_mode()) + xen_multicall_flush(); +} +#endif + +#else /* !CONFIG_XEN || MODULE */ + +static inline void xen_multicall_flush(void) {} +#define arch_use_lazy_mmu_mode() false +#define xen_multi_update_va_mapping(...) ({ BUG(); -ENOSYS; }) +#define xen_multi_mmu_update(...) ({ BUG(); -ENOSYS; }) +#define xen_multi_mmuext_op(...) ({ BUG(); -ENOSYS; }) + +#endif /* CONFIG_XEN && !MODULE */ + +#ifdef CONFIG_XEN + +struct gnttab_map_grant_ref; +bool gnttab_pre_map_adjust(unsigned int cmd, struct gnttab_map_grant_ref *, + unsigned int count); +#if CONFIG_XEN_COMPAT < 0x030400 +int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *, unsigned int); +#else +static inline int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *m, + unsigned int count) +{ + BUG(); + return -ENOSYS; +} +#endif + +#else /* !CONFIG_XEN */ + +#define gnttab_pre_map_adjust(...) false +#define gnttab_post_map_adjust(...) 
({ BUG(); -ENOSYS; }) + +#endif /* CONFIG_XEN */ + +#if defined(CONFIG_X86_64) +#define MULTI_UVMFLAGS_INDEX 2 +#define MULTI_UVMDOMID_INDEX 3 +#else +#define MULTI_UVMFLAGS_INDEX 3 +#define MULTI_UVMDOMID_INDEX 4 +#endif + +#ifdef CONFIG_XEN +#define is_running_on_xen() 1 +extern char hypercall_page[PAGE_SIZE]; +#else +extern char *hypercall_stubs; +#define is_running_on_xen() (!!hypercall_stubs) +#endif + +#include + +static inline int +HYPERVISOR_yield( + void) +{ + int rc = HYPERVISOR_sched_op(SCHEDOP_yield, NULL); + +#if CONFIG_XEN_COMPAT <= 0x030002 + if (rc == -ENOSYS) + rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0); +#endif + + return rc; +} + +static inline int +HYPERVISOR_block( + void) +{ + int rc = HYPERVISOR_sched_op(SCHEDOP_block, NULL); + +#if CONFIG_XEN_COMPAT <= 0x030002 + if (rc == -ENOSYS) + rc = HYPERVISOR_sched_op_compat(SCHEDOP_block, 0); +#endif + + return rc; +} + +static inline void __noreturn +HYPERVISOR_shutdown( + unsigned int reason) +{ + struct sched_shutdown sched_shutdown = { + .reason = reason + }; + + VOID(HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown)); +#if CONFIG_XEN_COMPAT <= 0x030002 + VOID(HYPERVISOR_sched_op_compat(SCHEDOP_shutdown, reason)); +#endif + /* Don't recurse needlessly. */ + BUG_ON(reason != SHUTDOWN_crash); + for(;;); +} + +static inline int __must_check +HYPERVISOR_poll( + evtchn_port_t *ports, unsigned int nr_ports, u64 timeout) +{ + int rc; + struct sched_poll sched_poll = { + .nr_ports = nr_ports, + .timeout = jiffies_to_st(timeout) + }; + set_xen_guest_handle(sched_poll.ports, ports); + + rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (rc == -ENOSYS) + rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0); +#endif + + return rc; +} + +static inline int __must_check +HYPERVISOR_poll_no_timeout( + evtchn_port_t *ports, unsigned int nr_ports) +{ + int rc; + struct sched_poll sched_poll = { + .nr_ports = nr_ports + }; + set_xen_guest_handle(sched_poll.ports, ports); + + rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (rc == -ENOSYS) + rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0); +#endif + + return rc; +} + +#ifdef CONFIG_XEN + +static inline void +MULTI_update_va_mapping( + multicall_entry_t *mcl, unsigned long va, + pte_t new_val, unsigned long flags) +{ + mcl->op = __HYPERVISOR_update_va_mapping; + mcl->args[0] = va; +#if defined(CONFIG_X86_64) + mcl->args[1] = new_val.pte; +#elif defined(CONFIG_X86_PAE) + mcl->args[1] = new_val.pte_low; + mcl->args[2] = new_val.pte_high; +#else + mcl->args[1] = new_val.pte_low; + mcl->args[2] = 0; +#endif + mcl->args[MULTI_UVMFLAGS_INDEX] = flags; +} + +static inline void +MULTI_mmu_update(multicall_entry_t *mcl, mmu_update_t *req, + unsigned int count, unsigned int *success_count, + domid_t domid) +{ + mcl->op = __HYPERVISOR_mmu_update; + mcl->args[0] = (unsigned long)req; + mcl->args[1] = count; + mcl->args[2] = (unsigned long)success_count; + mcl->args[3] = domid; +} + +static inline void +MULTI_grant_table_op(multicall_entry_t *mcl, unsigned int cmd, + void *uop, unsigned int count) +{ + mcl->op = __HYPERVISOR_grant_table_op; + mcl->args[0] = cmd; + mcl->args[1] = (unsigned long)uop; + mcl->args[2] = count; +} + +#else /* !defined(CONFIG_XEN) */ + +/* Multicalls not supported for HVM guests. 
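 *
 * For the CONFIG_XEN build above, the MULTI_* helpers only fill in a
 * multicall_entry_t; the batch still has to be submitted in one go.
 * A rough sketch, assuming the HYPERVISOR_multicall() wrapper from
 * hypercall.h and caller-supplied va0/va1/pte0/pte1 (all of which are
 * illustrative):
 *
 *	multicall_entry_t mcl[2];
 *
 *	MULTI_update_va_mapping(&mcl[0], va0, pte0, 0);
 *	MULTI_update_va_mapping(&mcl[1], va1, pte1, UVMF_INVLPG|UVMF_LOCAL);
 *	if (HYPERVISOR_multicall(mcl, 2))
 *		BUG();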
*/ +#define MULTI_update_va_mapping(a,b,c,d) ((void)0) +#define MULTI_grant_table_op(a,b,c,d) ((void)0) + +#endif + +#define uvm_multi(cpumask) ((unsigned long)cpumask_bits(cpumask) | UVMF_MULTI) + +#ifdef LINUX +/* drivers/staging/ use Windows-style types, including VOID */ +#undef VOID +#endif + +#endif /* __HYPERVISOR_H__ */ diff --git a/arch/x86/include/mach-xen/asm/i387.h b/arch/x86/include/mach-xen/asm/i387.h new file mode 100644 index 0000000..4922b04 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/i387.h @@ -0,0 +1,55 @@ +#ifndef _ASM_X86_I387_H +#define switch_fpu_prepare native_switch_fpu_prepare +#include_next + +#ifndef __ASSEMBLY__ +static inline void xen_thread_fpu_begin(struct task_struct *tsk, + multicall_entry_t *mcl) +{ + if (mcl) { + mcl->op = __HYPERVISOR_fpu_taskswitch; + mcl->args[0] = 0; + } + __thread_set_has_fpu(tsk); +} + +static inline fpu_switch_t xen_switch_fpu_prepare(struct task_struct *old, + struct task_struct *new, + int cpu, + multicall_entry_t **mcl) +{ + fpu_switch_t fpu; + + fpu.preload = tsk_used_math(new) && new->fpu_counter > 5; + if (__thread_has_fpu(old)) { + if (!__save_init_fpu(old)) + cpu = ~0; + old->thread.fpu.last_cpu = cpu; + old->thread.fpu.has_fpu = 0; /* But leave fpu_owner_task! */ + + /* Don't change CR0.TS if we just switch! */ + if (fpu.preload) { + new->fpu_counter++; + __thread_set_has_fpu(new); + prefetch(new->thread.fpu.state); + } else { + (*mcl)->op = __HYPERVISOR_fpu_taskswitch; + (*mcl)++->args[0] = 1; + } + } else { + old->fpu_counter = 0; + old->thread.fpu.last_cpu = ~0; + if (fpu.preload) { + new->fpu_counter++; + if (fpu_lazy_restore(new, cpu)) + fpu.preload = 0; + else + prefetch(new->thread.fpu.state); + xen_thread_fpu_begin(new, (*mcl)++); + } + } + return fpu; +} +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_I387_H */ diff --git a/arch/x86/include/mach-xen/asm/io.h b/arch/x86/include/mach-xen/asm/io.h new file mode 100644 index 0000000..2d07f8a --- /dev/null +++ b/arch/x86/include/mach-xen/asm/io.h @@ -0,0 +1,343 @@ +#ifndef _ASM_X86_IO_H +#define _ASM_X86_IO_H + +/* + * This file contains the definitions for the x86 IO instructions + * inb/inw/inl/outb/outw/outl and the "string versions" of the same + * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing" + * versions of the single-IO instructions (inb_p/inw_p/..). + * + * This file is not meant to be obfuscating: it's just complicated + * to (a) handle it all in a way that makes gcc able to optimize it + * as well as possible and (b) trying to avoid writing the same thing + * over and over again with slight variations and possibly making a + * mistake somewhere. + */ + +/* + * Thanks to James van Artsdalen for a better timing-fix than + * the two short jumps: using outb's to a nonexistent port seems + * to guarantee better timings even on fast machines. + * + * On the other hand, I'd like to be sure of a non-existent port: + * I feel a bit unsafe about using 0x80 (should be safe, though) + * + * Linus + */ + + /* + * Bit simplified and optimized by Jan Hubicka + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999. 
+ * + * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added, + * isa_read[wl] and isa_write[wl] fixed + * - Arnaldo Carvalho de Melo + */ + +#define ARCH_HAS_IOREMAP_WC + +#include +#include +#include +#ifdef __KERNEL__ +#include +#endif + +#define build_mmio_read(name, size, type, reg, barrier) \ +static inline type name(const volatile void __iomem *addr) \ +{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \ +:"m" (*(volatile type __force *)addr) barrier); return ret; } + +#define build_mmio_write(name, size, type, reg, barrier) \ +static inline void name(type val, volatile void __iomem *addr) \ +{ asm volatile("mov" size " %0,%1": :reg (val), \ +"m" (*(volatile type __force *)addr) barrier); } + +build_mmio_read(readb, "b", unsigned char, "=q", :"memory") +build_mmio_read(readw, "w", unsigned short, "=r", :"memory") +build_mmio_read(readl, "l", unsigned int, "=r", :"memory") + +build_mmio_read(__readb, "b", unsigned char, "=q", ) +build_mmio_read(__readw, "w", unsigned short, "=r", ) +build_mmio_read(__readl, "l", unsigned int, "=r", ) + +build_mmio_write(writeb, "b", unsigned char, "q", :"memory") +build_mmio_write(writew, "w", unsigned short, "r", :"memory") +build_mmio_write(writel, "l", unsigned int, "r", :"memory") + +build_mmio_write(__writeb, "b", unsigned char, "q", ) +build_mmio_write(__writew, "w", unsigned short, "r", ) +build_mmio_write(__writel, "l", unsigned int, "r", ) + +#define readb_relaxed(a) __readb(a) +#define readw_relaxed(a) __readw(a) +#define readl_relaxed(a) __readl(a) +#define __raw_readb __readb +#define __raw_readw __readw +#define __raw_readl __readl + +#define __raw_writeb __writeb +#define __raw_writew __writew +#define __raw_writel __writel + +#define mmiowb() barrier() + +#ifdef CONFIG_X86_64 + +build_mmio_read(readq, "q", unsigned long, "=r", :"memory") +build_mmio_write(writeq, "q", unsigned long, "r", :"memory") + +#define readq_relaxed(a) readq(a) + +#define __raw_readq(a) readq(a) +#define __raw_writeq(val, addr) writeq(val, addr) + +/* Let people know that we have them */ +#define readq readq +#define writeq writeq + +#endif + +/** + * virt_to_phys - map virtual addresses to physical + * @address: address to remap + * + * The returned physical address is the physical (CPU) mapping for + * the memory address given. It is only valid to use this function on + * addresses directly mapped or allocated via kmalloc. + * + * This function does not give bus mappings for DMA transfers. In + * almost all conceivable cases a device driver should not be using + * this function + */ + +static inline phys_addr_t virt_to_phys(volatile void *address) +{ + return __pa(address); +} + +/** + * phys_to_virt - map physical address to virtual + * @address: address to remap + * + * The returned virtual address is a current CPU mapping for + * the memory address given. It is only valid to use this function on + * addresses that have a kernel mapping + * + * This function does not handle bus mappings for DMA transfers. In + * almost all conceivable cases a device driver should not be using + * this function + */ + +static inline void *phys_to_virt(phys_addr_t address) +{ + return __va(address); +} + +/* + * Change "struct page" to physical address. 
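 *
 * Under Xen the two notions differ: page_to_pseudophys() yields the
 * guest-physical (pseudophysical) address, while page_to_phys() is
 * redefined below to return the machine address a device would see.
 * A small sketch (the page allocation and the missing NULL check are
 * illustrative):
 *
 *	struct page *pg = alloc_page(GFP_KERNEL);
 *	dma_addr_t gpa = page_to_pseudophys(pg);	// guest-physical
 *	dma_addr_t ma = page_to_phys(pg);		// machine address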
+ */ +#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) +#undef page_to_phys +#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page))) +#define page_to_bus(page) (phys_to_machine(page_to_pseudophys(page))) + +/* + * ISA I/O bus memory addresses are 1:1 with the physical address. + * However, we truncate the address to unsigned int to avoid undesirable + * promitions in legacy drivers. + */ +#define isa_virt_to_bus(_x) ({ \ + unsigned long _va_ = (unsigned long)(_x); \ + _va_ - fix_to_virt(FIX_ISAMAP_BEGIN) < (NR_FIX_ISAMAPS << PAGE_SHIFT) \ + ? _va_ - fix_to_virt(FIX_ISAMAP_BEGIN) \ + : ({ BUG(); (unsigned long)virt_to_bus(_va_); }); }) +#define isa_bus_to_virt(_x) ((void *)fix_to_virt(FIX_ISAMAP_BEGIN) + (_x)) + +/* + * However PCI ones are not necessarily 1:1 and therefore these interfaces + * are forbidden in portable PCI drivers. + * + * Allow them on x86 for legacy drivers, though. + */ +#define virt_to_bus(_x) phys_to_machine(__pa(_x)) +#define bus_to_virt(_x) __va(machine_to_phys(_x)) + +/** + * ioremap - map bus memory into CPU space + * @offset: bus address of the memory + * @size: size of the resource to map + * + * ioremap performs a platform specific sequence of operations to + * make bus memory CPU accessible via the readb/readw/readl/writeb/ + * writew/writel functions and the other mmio helpers. The returned + * address is not guaranteed to be usable directly as a virtual + * address. + * + * If the area you are trying to map is a PCI BAR you should have a + * look at pci_iomap(). + */ +extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); +extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); +extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, + unsigned long prot_val); + +/* + * The default ioremap() behavior is non-cached: + */ +static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) +{ + return ioremap_nocache(offset, size); +} + +extern void iounmap(volatile void __iomem *addr); + +extern void set_iounmap_nonlazy(void); + +#ifdef __KERNEL__ + +#include + +#include + +/* + * Convert a virtual cached pointer to an uncached pointer + */ +#define xlate_dev_kmem_ptr(p) p + +static inline void +memset_io(volatile void __iomem *addr, unsigned char val, size_t count) +{ + memset((void __force *)addr, val, count); +} + +static inline void +memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count) +{ + memcpy(dst, (const void __force *)src, count); +} + +static inline void +memcpy_toio(volatile void __iomem *dst, const void *src, size_t count) +{ + memcpy((void __force *)dst, src, count); +} + +/* + * Cache management + * + * This needed for two cases + * 1. Out of order aware processors + * 2. 
Accidentally out of order processors (PPro errata #51) + */ + +static inline void flush_write_buffers(void) +{ +#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE) + asm volatile("lock; addl $0,0(%%esp)": : :"memory"); +#endif +} + +#endif /* __KERNEL__ */ + +extern void native_io_delay(void); + +extern int io_delay_type; +extern void io_delay_init(void); + +static inline void slow_down_io(void) +{ + native_io_delay(); +#ifdef REALLY_SLOW_IO + native_io_delay(); + native_io_delay(); + native_io_delay(); +#endif +} + +#define BUILDIO(bwl, bw, type) \ +static inline void out##bwl(unsigned type value, int port) \ +{ \ + asm volatile("out" #bwl " %" #bw "0, %w1" \ + : : "a"(value), "Nd"(port)); \ +} \ + \ +static inline unsigned type in##bwl(int port) \ +{ \ + unsigned type value; \ + asm volatile("in" #bwl " %w1, %" #bw "0" \ + : "=a"(value) : "Nd"(port)); \ + return value; \ +} \ + \ +static inline void out##bwl##_p(unsigned type value, int port) \ +{ \ + out##bwl(value, port); \ + slow_down_io(); \ +} \ + \ +static inline unsigned type in##bwl##_p(int port) \ +{ \ + unsigned type value = in##bwl(port); \ + slow_down_io(); \ + return value; \ +} \ + \ +static inline void outs##bwl(int port, const void *addr, unsigned long count) \ +{ \ + asm volatile("rep; outs" #bwl \ + : "+S"(addr), "+c"(count) : "d"(port)); \ +} \ + \ +static inline void ins##bwl(int port, void *addr, unsigned long count) \ +{ \ + asm volatile("rep; ins" #bwl \ + : "+D"(addr), "+c"(count) : "d"(port)); \ +} + +BUILDIO(b, b, char) +BUILDIO(w, w, short) +BUILDIO(l, , int) + +#if defined(__KERNEL__) && !defined(__ASSEMBLY__) + +/* We will be supplying our own /dev/mem implementation */ +#define ARCH_HAS_DEV_MEM + +#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \ + (unsigned long)(bv)->bv_offset) + +#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ + (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ + && bvec_to_pseudophys(vec1) + (vec1)->bv_len \ + == bvec_to_pseudophys(vec2)) + +#endif + +extern void *xlate_dev_mem_ptr(unsigned long phys); +extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr); + +extern int ioremap_check_change_attr(unsigned long mfn, unsigned long size, + unsigned long prot_val); +extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size); + +/* + * early_ioremap() and early_iounmap() are for temporary early boot-time + * mappings, before the real ioremap() is functional. + * A boot-time mapping is currently limited to at most 16 pages. 
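 *
 * Typical boot-time use maps, copies and unmaps within one scope; a
 * short sketch, where phys, len and buf are illustrative and come from
 * the caller:
 *
 *	void __iomem *p = early_ioremap(phys, len);
 *
 *	if (p) {
 *		memcpy_fromio(buf, p, len);
 *		early_iounmap(p, len);
 *	}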
+ */ +extern void early_ioremap_init(void); +extern void early_ioremap_reset(void); +extern void __iomem *early_ioremap(resource_size_t phys_addr, + unsigned long size); +extern void __iomem *early_memremap(resource_size_t phys_addr, + unsigned long size); +extern void __iomem *early_memremap_ro(resource_size_t phys_addr, + unsigned long size); +extern void early_iounmap(void __iomem *addr, unsigned long size); +extern void fixup_early_ioremap(void); +extern bool is_early_ioremap_ptep(pte_t *ptep); + +#define IO_SPACE_LIMIT 0xffff + +#endif /* _ASM_X86_IO_H */ diff --git a/arch/x86/include/mach-xen/asm/ipi.h b/arch/x86/include/mach-xen/asm/ipi.h new file mode 100644 index 0000000..4bdda1d --- /dev/null +++ b/arch/x86/include/mach-xen/asm/ipi.h @@ -0,0 +1,13 @@ +#ifndef _ASM_X86_IPI_H +#define _ASM_X86_IPI_H + +#include +#include + +void xen_send_IPI_mask(const struct cpumask *, int vector); +void xen_send_IPI_mask_allbutself(const struct cpumask *, int vector); +void xen_send_IPI_allbutself(int vector); +void xen_send_IPI_all(int vector); +void xen_send_IPI_self(int vector); + +#endif /* _ASM_X86_IPI_H */ diff --git a/arch/x86/include/mach-xen/asm/irq_vectors.h b/arch/x86/include/mach-xen/asm/irq_vectors.h new file mode 100644 index 0000000..7798731 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/irq_vectors.h @@ -0,0 +1,98 @@ +#ifndef _ASM_X86_IRQ_VECTORS_H +#define _ASM_X86_IRQ_VECTORS_H + +#define MCE_VECTOR 0x12 + +#define IA32_SYSCALL_VECTOR 0x80 +#ifdef CONFIG_X86_32 +# define SYSCALL_VECTOR 0x80 +#endif + +#define RESCHEDULE_VECTOR 0 +#define CALL_FUNCTION_VECTOR 1 +#define NMI_VECTOR 0x02 +#define CALL_FUNC_SINGLE_VECTOR 3 +#define REBOOT_VECTOR 4 +#ifdef CONFIG_IRQ_WORK +#define IRQ_WORK_VECTOR 5 +#define NR_IPIS 6 +#else +#define NR_IPIS 5 +#endif + +/* + * The maximum number of vectors supported by i386 processors + * is limited to 256. For processors other than i386, NR_VECTORS + * should be changed accordingly. + */ +#define NR_VECTORS 256 + +#define FIRST_VM86_IRQ 3 +#define LAST_VM86_IRQ 15 + +#ifndef __ASSEMBLY__ +static inline int invalid_vm86_irq(int irq) +{ + return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ; +} +#endif + +/* + * Size the maximum number of interrupts. + * + * If the irq_desc[] array has a sparse layout, we can size things + * generously - it scales up linearly with the maximum number of CPUs, + * and the maximum number of IO-APICs, whichever is higher. + * + * In other cases we size more conservatively, to not create too large + * static arrays. + */ + +#define NR_IRQS_LEGACY 16 + +/* + * The flat IRQ space is divided into two regions: + * 1. A one-to-one mapping of real physical IRQs. This space is only used + * if we have physical device-access privilege. This region is at the + * start of the IRQ space so that existing device drivers do not need + * to be modified to translate physical IRQ numbers into our IRQ space. + * 3. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These + * are bound using the provided bind/unbind functions. + */ +#define PIRQ_BASE 0 +/* PHYSDEVOP_pirq_eoi_gmfn restriction: */ +#define PIRQ_MAX(n) ((n) < (1 << (PAGE_SHIFT + 3)) - NR_VECTORS \ + ? (n) : (1 << (PAGE_SHIFT + 3)) - NR_VECTORS) + +#define IO_APIC_VECTOR_LIMIT PIRQ_MAX(32 * MAX_IO_APICS) +#define CPU_VECTOR_LIMIT PIRQ_MAX(64 * NR_CPUS) + +#if defined(CONFIG_X86_IO_APIC) +# define NR_PIRQS \ + (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? 
\ + (NR_VECTORS + CPU_VECTOR_LIMIT) : \ + (NR_VECTORS + IO_APIC_VECTOR_LIMIT)) +#elif defined(CONFIG_XEN_PCIDEV_FRONTEND) +# define NR_PIRQS (NR_VECTORS + CPU_VECTOR_LIMIT) +#else /* !CONFIG_X86_IO_APIC: */ +# define NR_PIRQS NR_IRQS_LEGACY +#endif + +#ifndef __ASSEMBLY__ +#ifdef CONFIG_SPARSE_IRQ +extern int nr_pirqs; +#else +# define nr_pirqs NR_PIRQS +#endif +#endif + +#define DYNIRQ_BASE (PIRQ_BASE + nr_pirqs) +#ifdef CONFIG_SPARSE_IRQ +#define NR_DYNIRQS (CPU_VECTOR_LIMIT + CONFIG_XEN_NR_GUEST_DEVICES) +#else +#define NR_DYNIRQS (64 + CONFIG_XEN_NR_GUEST_DEVICES) +#endif + +#define NR_IRQS (NR_PIRQS + NR_DYNIRQS) + +#endif /* _ASM_X86_IRQ_VECTORS_H */ diff --git a/arch/x86/include/mach-xen/asm/irqflags.h b/arch/x86/include/mach-xen/asm/irqflags.h new file mode 100644 index 0000000..95d336f --- /dev/null +++ b/arch/x86/include/mach-xen/asm/irqflags.h @@ -0,0 +1,212 @@ +#ifndef _X86_IRQFLAGS_H_ +#define _X86_IRQFLAGS_H_ + +#include + +#ifndef __ASSEMBLY__ +#include +#include +/* + * The use of 'barrier' in the following reflects their use as local-lock + * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following + * critical operations are executed. All critical operations must complete + * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also + * includes these barriers, for example. + */ + +#define xen_save_fl(void) vcpu_info_read(evtchn_upcall_mask) + +#define xen_restore_fl(f) \ +do { \ + vcpu_info_t *_vcpu; \ + barrier(); \ + _vcpu = current_vcpu_info(); \ + if ((_vcpu->evtchn_upcall_mask = (f)) == 0) { \ + barrier(); /* unmask then check (avoid races) */\ + if (unlikely(_vcpu->evtchn_upcall_pending)) \ + force_evtchn_callback(); \ + } \ +} while (0) + +#define xen_irq_disable() \ +do { \ + vcpu_info_write(evtchn_upcall_mask, 1); \ + barrier(); \ +} while (0) + +#define xen_irq_enable() \ +do { \ + vcpu_info_t *_vcpu; \ + barrier(); \ + _vcpu = current_vcpu_info(); \ + _vcpu->evtchn_upcall_mask = 0; \ + barrier(); /* unmask then check (avoid races) */ \ + if (unlikely(_vcpu->evtchn_upcall_pending)) \ + force_evtchn_callback(); \ +} while (0) + +#define arch_local_save_flags() xen_save_fl() + +#define arch_local_irq_restore(flags) xen_restore_fl(flags) + +#define arch_local_irq_disable() xen_irq_disable() + +#define arch_local_irq_enable() xen_irq_enable() + +/* + * Used in the idle loop; sti takes one instruction cycle + * to complete: + */ +#define arch_safe_halt HYPERVISOR_block + +/* + * Used when interrupts are already enabled or to + * shutdown the processor: + */ +#define halt() VOID(irqs_disabled() \ + ? HYPERVISOR_vcpu_op(VCPUOP_down, \ + smp_processor_id(), NULL) \ + : 0) + +/* + * For spinlocks, etc: + */ +#define arch_local_irq_save() \ +({ \ + unsigned long flags = arch_local_save_flags(); \ + \ + arch_local_irq_disable(); \ + \ + flags; \ +}) +#else + +/* Offsets into shared_info_t. 
*/ +#define evtchn_upcall_pending /* 0 */ +#define evtchn_upcall_mask 1 + +#ifdef CONFIG_X86_64 +# define __REG_si %rsi +# define __CPU_num PER_CPU_VAR(cpu_number) +#else +# define __REG_si %esi +# define __CPU_num TI_cpu(%ebp) +#endif + +#ifdef CONFIG_XEN_VCPU_INFO_PLACEMENT + +#define GET_VCPU_INFO PER_CPU(vcpu_info, __REG_si) +#define __DISABLE_INTERRUPTS movb $1,PER_CPU_VAR(vcpu_info+evtchn_upcall_mask) +#define __ENABLE_INTERRUPTS movb $0,PER_CPU_VAR(vcpu_info+evtchn_upcall_mask) +#define __TEST_PENDING cmpb $0,PER_CPU_VAR(vcpu_info+evtchn_upcall_pending+0) +#define DISABLE_INTERRUPTS(clb) __DISABLE_INTERRUPTS +#define ENABLE_INTERRUPTS(clb) __ENABLE_INTERRUPTS + +#define __SIZEOF_DISABLE_INTERRUPTS 8 +#define __SIZEOF_TEST_PENDING 8 + +#else /* CONFIG_XEN_VCPU_INFO_PLACEMENT */ + +#define sizeof_vcpu_shift 6 + +#ifdef CONFIG_SMP +#define GET_VCPU_INFO movl __CPU_num,%esi ; \ + shl $sizeof_vcpu_shift,%esi ; \ + add HYPERVISOR_shared_info,__REG_si +#else +#define GET_VCPU_INFO mov HYPERVISOR_shared_info,__REG_si +#endif + +#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(__REG_si) +#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(__REG_si) +#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(__REG_si) +#define DISABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \ + __DISABLE_INTERRUPTS +#define ENABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \ + __ENABLE_INTERRUPTS + +#define __SIZEOF_DISABLE_INTERRUPTS 4 +#define __SIZEOF_TEST_PENDING 3 + +#endif /* CONFIG_XEN_VCPU_INFO_PLACEMENT */ + +#ifndef CONFIG_X86_64 +#define INTERRUPT_RETURN iret +#define ENABLE_INTERRUPTS_SYSEXIT \ + movb $0,evtchn_upcall_mask(%esi) /* __ENABLE_INTERRUPTS */ ; \ +sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \ + cmpb $0,evtchn_upcall_pending(%esi) /* __TEST_PENDING */ ; \ + jnz 14f /* process more events if necessary... 
*/ ; \ + movl PT_ESI(%esp), %esi ; \ + sysexit ; \ +14: movb $1,evtchn_upcall_mask(%esi) /* __DISABLE_INTERRUPTS */ ; \ + TRACE_IRQS_OFF ; \ +sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ ; \ + mov $__KERNEL_PERCPU, %ecx ; \ + push %esp ; \ + mov %ecx, %fs ; \ + SET_KERNEL_GS %ecx ; \ + call evtchn_do_upcall ; \ + add $4,%esp ; \ + jmp ret_from_intr +#endif + + +#endif /* __ASSEMBLY__ */ + +#ifndef __ASSEMBLY__ +static inline int arch_irqs_disabled_flags(unsigned long flags) +{ + return (flags != 0); +} + +#define arch_irqs_disabled() \ +({ \ + unsigned long flags = arch_local_save_flags(); \ + \ + arch_irqs_disabled_flags(flags); \ +}) + +#else + +#ifdef CONFIG_X86_64 +#define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk +#define ARCH_LOCKDEP_SYS_EXIT_IRQ \ + TRACE_IRQS_ON; \ + ENABLE_INTERRUPTS(CLBR_NONE); \ + SAVE_REST; \ + LOCKDEP_SYS_EXIT; \ + RESTORE_REST; \ + __DISABLE_INTERRUPTS; \ + TRACE_IRQS_OFF; + +#else +#define ARCH_LOCKDEP_SYS_EXIT \ + pushl %eax; \ + pushl %ecx; \ + pushl %edx; \ + call lockdep_sys_exit; \ + popl %edx; \ + popl %ecx; \ + popl %eax; + +#define ARCH_LOCKDEP_SYS_EXIT_IRQ +#endif + +#ifdef CONFIG_TRACE_IRQFLAGS +# define TRACE_IRQS_ON call trace_hardirqs_on_thunk; +# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk; +#else +# define TRACE_IRQS_ON +# define TRACE_IRQS_OFF +#endif +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define LOCKDEP_SYS_EXIT ARCH_LOCKDEP_SYS_EXIT +# define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ +# else +# define LOCKDEP_SYS_EXIT +# define LOCKDEP_SYS_EXIT_IRQ +# endif + +#endif /* __ASSEMBLY__ */ +#endif diff --git a/arch/x86/include/mach-xen/asm/mach_traps.h b/arch/x86/include/mach-xen/asm/mach_traps.h new file mode 100644 index 0000000..99314d3 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/mach_traps.h @@ -0,0 +1,37 @@ +/* + * include/asm-xen/asm-i386/mach-xen/mach_traps.h + * + * Machine specific NMI handling for Xen + */ +#ifndef _MACH_TRAPS_H +#define _MACH_TRAPS_H + +#include +#include + +#define NMI_REASON_SERR 0x80 +#define NMI_REASON_IOCHK 0x40 +#define NMI_REASON_MASK (NMI_REASON_SERR | NMI_REASON_IOCHK) + +static inline void clear_serr_error(unsigned char reason) {} +static inline void clear_io_check_error(unsigned char reason) {} + +static inline unsigned char xen_get_nmi_reason(void) +{ + shared_info_t *s = HYPERVISOR_shared_info; + unsigned char reason = 0; + + /* construct a value which looks like it came from + * port 0x61. + */ + if (test_bit(_XEN_NMIREASON_io_error, &s->arch.nmi_reason)) + reason |= NMI_REASON_IOCHK; + if (test_bit(_XEN_NMIREASON_parity_error, &s->arch.nmi_reason)) + reason |= NMI_REASON_SERR; + + return reason; +} + +static inline void reassert_nmi(void) {} + +#endif /* !_MACH_TRAPS_H */ diff --git a/arch/x86/include/mach-xen/asm/maddr.h b/arch/x86/include/mach-xen/asm/maddr.h new file mode 100644 index 0000000..455e848 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/maddr.h @@ -0,0 +1,155 @@ +#ifndef _X86_MADDR_H +#define _X86_MADDR_H + +#include +#include +#include +#include + +/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/ +#define INVALID_P2M_ENTRY (~0UL) +#define FOREIGN_FRAME_BIT (1UL << (BITS_PER_LONG - 1)) +#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) + +/* Definitions for machine and pseudophysical addresses. 
*/ +#ifdef CONFIG_X86_PAE +typedef unsigned long long paddr_t; +typedef unsigned long long maddr_t; +#else +typedef unsigned long paddr_t; +typedef unsigned long maddr_t; +#endif + +#ifdef CONFIG_XEN + +extern unsigned long *phys_to_machine_mapping; +extern unsigned long max_mapnr; + +#undef machine_to_phys_mapping +extern unsigned long *machine_to_phys_mapping; +extern unsigned long machine_to_phys_nr; + +static inline unsigned long pfn_to_mfn(unsigned long pfn) +{ + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) + return pfn; + if (likely(max_mapnr)) + BUG_ON(pfn >= max_mapnr); + return phys_to_machine_mapping[pfn] & ~FOREIGN_FRAME_BIT; +} + +static inline int phys_to_machine_mapping_valid(unsigned long pfn) +{ + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) + return 1; + if (likely(max_mapnr)) + BUG_ON(pfn >= max_mapnr); + return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY); +} + +static inline unsigned long mfn_to_pfn(unsigned long mfn) +{ + unsigned long pfn; + + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) + return mfn; + + if (unlikely(mfn >= machine_to_phys_nr)) + return max_mapnr; + + /* The array access can fail (e.g., device space beyond end of RAM). */ + asm ( + "1: "_ASM_MOV" %1,%0\n" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: "_ASM_MOV" %2,%0\n" + " jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b,3b) + : "=r" (pfn) + : "m" (machine_to_phys_mapping[mfn]), "m" (max_mapnr) ); + + return pfn; +} + +/* + * We detect special mappings in one of two ways: + * 1. If the MFN is an I/O page then Xen will set the m2p entry + * to be outside our maximum possible pseudophys range. + * 2. If the MFN belongs to a different domain then we will certainly + * not have MFN in our p2m table. Conversely, if the page is ours, + * then we'll have p2m(m2p(MFN))==MFN. + * If we detect a special mapping then it doesn't have a 'struct page'. + * We force !pfn_valid() by returning an out-of-range pointer. + * + * NB. These checks require that, for any MFN that is not in our reservation, + * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if + * we are foreign-mapping the MFN, and the other domain as m2p(MFN) == PFN. + * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety. + * + * NB2. When deliberately mapping foreign pages into the p2m table, you *must* + * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we + * require. In all the cases we care about, the FOREIGN_FRAME bit is + * masked (e.g., pfn_to_mfn()) so behaviour there is correct. 
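 *
 * A round-trip sketch for a RAM page this domain owns (pg is an
 * illustrative page; with XENFEAT_auto_translated_physmap both
 * conversions are the identity):
 *
 *	unsigned long pfn = page_to_pfn(pg);
 *	unsigned long mfn = pfn_to_mfn(pfn);
 *
 *	BUG_ON(mfn_to_pfn(mfn) != pfn);		// p2m(m2p(MFN)) == PFN
 *
 * For a deliberately mapped foreign page, the p2m slot is instead set
 * via set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)), so mfn_to_local_pfn()
 * below detects the mismatch and returns an out-of-range pfn.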
+ */ +static inline unsigned long mfn_to_local_pfn(phys_addr_t mfn) +{ + unsigned long pfn = mfn_to_pfn(mfn); + if (likely(pfn < max_mapnr) + && likely(!xen_feature(XENFEAT_auto_translated_physmap)) + && unlikely(phys_to_machine_mapping[pfn] != mfn)) + return max_mapnr; /* force !pfn_valid() */ + return pfn; +} + +static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ + if (likely(max_mapnr)) + BUG_ON(pfn >= max_mapnr); + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { + BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); + return; + } + phys_to_machine_mapping[pfn] = mfn; +} + +static inline maddr_t phys_to_machine(paddr_t phys) +{ + maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT); + machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK); + return machine; +} + +static inline paddr_t machine_to_phys(maddr_t machine) +{ + paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT); + phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK); + return phys; +} + +#ifdef CONFIG_X86_32 +# include "maddr_32.h" +#else +# include "maddr_64.h" +#endif + +#else /* !CONFIG_XEN */ + +#define pfn_to_mfn(pfn) (pfn) +#define mfn_to_pfn(mfn) (mfn) +#define mfn_to_local_pfn(mfn) (mfn) +#define set_phys_to_machine(pfn, mfn) ((void)0) +#define phys_to_machine_mapping_valid(pfn) 1 +#define phys_to_machine(phys) ((maddr_t)(phys)) +#define machine_to_phys(mach) ((paddr_t)(mach)) +#define pfn_pte_ma(pfn, prot) pfn_pte(pfn, prot) +#define __pte_ma(x) __pte(x) + +#endif /* !CONFIG_XEN */ + +/* VIRT <-> MACHINE conversion */ +#define virt_to_machine(v) phys_to_machine(__pa(v)) +#define virt_to_mfn(v) pfn_to_mfn(__pa(v) >> PAGE_SHIFT) +#define mfn_to_virt(m) __va(mfn_to_pfn(m) << PAGE_SHIFT) + +#endif /* _X86_MADDR_H */ diff --git a/arch/x86/include/mach-xen/asm/maddr_32.h b/arch/x86/include/mach-xen/asm/maddr_32.h new file mode 100644 index 0000000..de34d87 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/maddr_32.h @@ -0,0 +1,35 @@ +#ifndef _I386_MADDR_H +#define _I386_MADDR_H + +#ifdef CONFIG_X86_PAE +static inline paddr_t pte_phys_to_machine(paddr_t phys) +{ + /* + * In PAE mode, the NX bit needs to be dealt with in the value + * passed to pfn_to_mfn(). On x86_64, we need to mask it off, + * but for i386 the conversion to ulong for the argument will + * clip it off. + */ + maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT); + machine = (machine << PAGE_SHIFT) | (phys & ~PHYSICAL_PAGE_MASK); + return machine; +} + +static inline paddr_t pte_machine_to_phys(maddr_t machine) +{ + /* + * In PAE mode, the NX bit needs to be dealt with in the value + * passed to mfn_to_pfn(). On x86_64, we need to mask it off, + * but for i386 the conversion to ulong for the argument will + * clip it off. 
+ */ + paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT); + phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK); + return phys; +} +#else +#define pte_phys_to_machine phys_to_machine +#define pte_machine_to_phys machine_to_phys +#endif + +#endif /* _I386_MADDR_H */ diff --git a/arch/x86/include/mach-xen/asm/maddr_64.h b/arch/x86/include/mach-xen/asm/maddr_64.h new file mode 100644 index 0000000..e2c271e --- /dev/null +++ b/arch/x86/include/mach-xen/asm/maddr_64.h @@ -0,0 +1,21 @@ +#ifndef _X86_64_MADDR_H +#define _X86_64_MADDR_H + +static inline paddr_t pte_phys_to_machine(paddr_t phys) +{ + maddr_t machine; + machine = pfn_to_mfn((phys & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT); + machine = (machine << PAGE_SHIFT) | (phys & ~PHYSICAL_PAGE_MASK); + return machine; +} + +static inline paddr_t pte_machine_to_phys(maddr_t machine) +{ + paddr_t phys; + phys = mfn_to_pfn((machine & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT); + phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK); + return phys; +} + +#endif /* _X86_64_MADDR_H */ + diff --git a/arch/x86/include/mach-xen/asm/mmu_context.h b/arch/x86/include/mach-xen/asm/mmu_context.h new file mode 100644 index 0000000..1fbe9dd --- /dev/null +++ b/arch/x86/include/mach-xen/asm/mmu_context.h @@ -0,0 +1,165 @@ +#ifndef _ASM_X86_MMU_CONTEXT_H +#define _ASM_X86_MMU_CONTEXT_H + +#include +#include +#include +#include + +void arch_exit_mmap(struct mm_struct *mm); +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); + +void mm_pin(struct mm_struct *mm); +void mm_unpin(struct mm_struct *mm); +void mm_pin_all(void); + +static inline void xen_activate_mm(struct mm_struct *prev, + struct mm_struct *next) +{ + if (!PagePinned(virt_to_page(next->pgd))) + mm_pin(next); +} + +/* + * Used for LDT copy/destruction. + */ +int init_new_context(struct task_struct *tsk, struct mm_struct *mm); +void destroy_context(struct mm_struct *mm); + + +static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) +{ +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */ + if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) + percpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); +#endif +} + +#define prepare_arch_switch(next) __prepare_arch_switch() + +static inline void __prepare_arch_switch(void) +{ +#ifdef CONFIG_X86_32 + /* + * Save away %gs. No need to save %fs, as it was saved on the + * stack on entry. No need to save %es and %ds, as those are + * always kernel segments while inside the kernel. + */ + lazy_save_gs(current->thread.gs); + lazy_load_gs(__KERNEL_STACK_CANARY); +#else + /* + * Save away %es, %ds, %fs and %gs. Must happen before reload + * of cr3/ldt (i.e., not in __switch_to). 
+ */ + __asm__ __volatile__ ( + "mov %%es,%0 ; mov %%ds,%1 ; mov %%fs,%2 ; mov %%gs,%3" + : "=m" (current->thread.es), + "=m" (current->thread.ds), + "=m" (current->thread.fsindex), + "=m" (current->thread.gsindex) ); + + if (current->thread.ds) + __asm__ __volatile__ ( "movl %0,%%ds" : : "r" (0) ); + + if (current->thread.es) + __asm__ __volatile__ ( "movl %0,%%es" : : "r" (0) ); + + if (current->thread.fsindex) { + __asm__ __volatile__ ( "movl %0,%%fs" : : "r" (0) ); + current->thread.fs = 0; + } + + if (current->thread.gsindex) { + load_gs_index(0); + current->thread.gs = 0; + } +#endif +} + +static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + unsigned cpu = smp_processor_id(); + struct mmuext_op _op[2 + (sizeof(long) > 4)], *op = _op; +#ifdef CONFIG_X86_64 + pgd_t *upgd; +#endif + + if (likely(prev != next)) { + BUG_ON(!xen_feature(XENFEAT_writable_page_tables) && + !PagePinned(virt_to_page(next->pgd))); + +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */ + percpu_write(cpu_tlbstate.state, TLBSTATE_OK); + percpu_write(cpu_tlbstate.active_mm, next); +#endif + cpumask_set_cpu(cpu, mm_cpumask(next)); + + /* Re-load page tables: load_cr3(next->pgd) */ + op->cmd = MMUEXT_NEW_BASEPTR; + op->arg1.mfn = virt_to_mfn(next->pgd); + op++; + + /* xen_new_user_pt(next->pgd) */ +#ifdef CONFIG_X86_64 + op->cmd = MMUEXT_NEW_USER_BASEPTR; + upgd = __user_pgd(next->pgd); + op->arg1.mfn = likely(upgd) ? virt_to_mfn(upgd) : 0; + op++; +#endif + + /* + * load the LDT, if the LDT is different: + */ + if (unlikely(prev->context.ldt != next->context.ldt)) { + /* load_LDT_nolock(&next->context) */ + op->cmd = MMUEXT_SET_LDT; + op->arg1.linear_addr = (unsigned long)next->context.ldt; + op->arg2.nr_ents = next->context.size; + op++; + } + + BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF)); + + /* stop TLB flushes for the previous mm */ + cpumask_clear_cpu(cpu, mm_cpumask(prev)); + } +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */ + else { + percpu_write(cpu_tlbstate.state, TLBSTATE_OK); + BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next); + + if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) { + /* We were in lazy tlb mode and leave_mm disabled + * tlb flush IPI delivery. We must reload CR3 + * to make sure to use no freed page tables. 
+ */ + load_cr3(next->pgd); + xen_new_user_pt(next->pgd); + load_LDT_nolock(&next->context); + } + } +#endif +} + +#define activate_mm(prev, next) \ +do { \ + xen_activate_mm(prev, next); \ + switch_mm((prev), (next), NULL); \ +} while (0); + +#ifdef CONFIG_X86_32 +#define deactivate_mm(tsk, mm) \ +do { \ + lazy_load_gs(0); \ +} while (0) +#else +#define deactivate_mm(tsk, mm) \ +do { \ + load_gs_index(0); \ + loadsegment(fs, 0); \ +} while (0) +#endif + +#endif /* _ASM_X86_MMU_CONTEXT_H */ diff --git a/arch/x86/include/mach-xen/asm/mutex.h b/arch/x86/include/mach-xen/asm/mutex.h new file mode 100644 index 0000000..ee9126e --- /dev/null +++ b/arch/x86/include/mach-xen/asm/mutex.h @@ -0,0 +1,3 @@ +#define arch_cpu_is_running(cpu) vcpu_running(cpu) + +#include_next diff --git a/arch/x86/include/mach-xen/asm/pci.h b/arch/x86/include/mach-xen/asm/pci.h new file mode 100644 index 0000000..54289aa --- /dev/null +++ b/arch/x86/include/mach-xen/asm/pci.h @@ -0,0 +1,180 @@ +#ifndef _ASM_X86_PCI_H +#define _ASM_X86_PCI_H + +#include /* for struct page */ +#include +#include +#include +#include +#include +#include + +#ifdef __KERNEL__ + +struct pci_sysdata { + int domain; /* PCI domain */ + int node; /* NUMA node */ +#ifdef CONFIG_X86_64 + void *iommu; /* IOMMU private data */ +#endif +#ifdef CONFIG_XEN_PCIDEV_FRONTEND + struct pcifront_device *pdev; +#endif +}; + +extern int pci_routeirq; +extern int noioapicquirk; +extern int noioapicreroute; + +/* scan a bus after allocating a pci_sysdata for it */ +extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, + int node); +extern struct pci_bus *pci_scan_bus_with_sysdata(int busno); + +#ifdef CONFIG_PCI + +#ifdef CONFIG_PCI_DOMAINS +static inline int pci_domain_nr(struct pci_bus *bus) +{ + struct pci_sysdata *sd = bus->sysdata; + return sd->domain; +} + +static inline int pci_proc_domain(struct pci_bus *bus) +{ + return pci_domain_nr(bus); +} +#endif + +/* Can be used to override the logic in pci_scan_bus for skipping + already-configured bus numbers - to be used for buggy BIOSes + or architectures with incomplete PCI setup by the loader */ + +extern unsigned int pcibios_assign_all_busses(void); +extern int pci_legacy_init(void); +# ifdef CONFIG_ACPI +# define x86_default_pci_init pci_acpi_init +# else +# define x86_default_pci_init pci_legacy_init +# endif +#else +# define pcibios_assign_all_busses() 0 +# define x86_default_pci_init NULL +#endif + +#include +#define pcibios_scan_all_fns(a, b) (!is_initial_xendomain()) + +extern unsigned long pci_mem_start; +#define PCIBIOS_MIN_IO 0x1000 +#define PCIBIOS_MIN_MEM (pci_mem_start) + +#define PCIBIOS_MIN_CARDBUS_IO 0x4000 + +extern int pcibios_enabled; +void pcibios_config_init(void); +struct pci_bus *pcibios_scan_root(int bus); + +void pcibios_set_master(struct pci_dev *dev); +void pcibios_penalize_isa_irq(int irq, int active); +struct irq_routing_table *pcibios_get_irq_routing_table(void); +int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq); + + +#define HAVE_PCI_MMAP +extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, + enum pci_mmap_state mmap_state, + int write_combine); + + +#ifdef CONFIG_PCI +extern void early_quirks(void); +static inline void pci_dma_burst_advice(struct pci_dev *pdev, + enum pci_dma_burst_strategy *strat, + unsigned long *strategy_parameter) +{ + *strat = PCI_DMA_BURST_INFINITY; + *strategy_parameter = ~0UL; +} +#else +static inline void early_quirks(void) { } +#endif + +extern void pci_iommu_alloc(void); + +#if 
defined(CONFIG_PCI_MSI) && !defined(CONFIG_XEN) +/* MSI arch specific hooks */ +static inline int x86_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +{ + return x86_msi.setup_msi_irqs(dev, nvec, type); +} + +static inline void x86_teardown_msi_irqs(struct pci_dev *dev) +{ + x86_msi.teardown_msi_irqs(dev); +} + +static inline void x86_teardown_msi_irq(unsigned int irq) +{ + x86_msi.teardown_msi_irq(irq); +} +static inline void x86_restore_msi_irqs(struct pci_dev *dev, int irq) +{ + x86_msi.restore_msi_irqs(dev, irq); +} +#define arch_setup_msi_irqs x86_setup_msi_irqs +#define arch_teardown_msi_irqs x86_teardown_msi_irqs +#define arch_teardown_msi_irq x86_teardown_msi_irq +#define arch_restore_msi_irqs x86_restore_msi_irqs +/* implemented in arch/x86/kernel/apic/io_apic. */ +int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); +void native_teardown_msi_irq(unsigned int irq); +void native_restore_msi_irqs(struct pci_dev *dev, int irq); +/* default to the implementation in drivers/lib/msi.c */ +#define HAVE_DEFAULT_MSI_TEARDOWN_IRQS +#define HAVE_DEFAULT_MSI_RESTORE_IRQS +void default_teardown_msi_irqs(struct pci_dev *dev); +void default_restore_msi_irqs(struct pci_dev *dev, int irq); +#else +#define native_setup_msi_irqs NULL +#define native_teardown_msi_irq NULL +#define default_teardown_msi_irqs NULL +#define default_restore_msi_irqs NULL +#endif + +#define PCI_DMA_BUS_IS_PHYS 0 + +#endif /* __KERNEL__ */ + +#ifdef CONFIG_X86_64 +#include "../../asm/pci_64.h" +#endif + +/* implement the pci_ DMA API in terms of the generic device dma_ one */ +#include + +/* generic pci stuff */ +#include +#define PCIBIOS_MAX_MEM_32 0xffffffff + +#ifdef CONFIG_NUMA +/* Returns the node based on pci bus */ +static inline int __pcibus_to_node(const struct pci_bus *bus) +{ + const struct pci_sysdata *sd = bus->sysdata; + + return sd->node; +} + +static inline const struct cpumask * +cpumask_of_pcibus(const struct pci_bus *bus) +{ + int node; + + node = __pcibus_to_node(bus); + return (node == -1) ? 
cpu_online_mask : + cpumask_of_node(node); +} +#endif + +#endif /* _ASM_X86_PCI_H */ diff --git a/arch/x86/include/mach-xen/asm/percpu.h b/arch/x86/include/mach-xen/asm/percpu.h new file mode 100644 index 0000000..336a525 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/percpu.h @@ -0,0 +1,61 @@ +#ifndef _ASM_X86_XEN_PERCPU_H +#define _ASM_X86_XEN_PERCPU_H + +#include_next + +#define this_vcpu_read_1 this_cpu_read_1 +#define this_vcpu_read_2 this_cpu_read_2 +#define this_vcpu_read_4 this_cpu_read_4 + +#ifdef CONFIG_64BIT +# define this_vcpu_read_8 this_cpu_read_8 +#else +# define this_vcpu_read_8(pcp) ({ \ + typeof(pcp) res__; \ + __asm__ ("movl %%ebx,%%eax\n" \ + "movl %%ecx,%%edx\n" \ + "cmpxchg8b " __percpu_arg(1) \ + : "=&A" (res__) : "m" (pcp)); \ + res__; }) +#endif + +#define this_vcpu_read(pcp) __pcpu_size_call_return(this_vcpu_read_, pcp) + +#define percpu_exchange_op(op, var, val) \ +({ \ + typedef typeof(var) pxo_T__; \ + pxo_T__ pxo_ret__; \ + if (0) { \ + pxo_ret__ = (val); \ + (void)pxo_ret__; \ + } \ + switch (sizeof(var)) { \ + case 1: \ + asm(op "b %0,"__percpu_arg(1) \ + : "=q" (pxo_ret__), "+m" (var) \ + : "0" ((pxo_T__)(val))); \ + break; \ + case 2: \ + asm(op "w %0,"__percpu_arg(1) \ + : "=r" (pxo_ret__), "+m" (var) \ + : "0" ((pxo_T__)(val))); \ + break; \ + case 4: \ + asm(op "l %0,"__percpu_arg(1) \ + : "=r" (pxo_ret__), "+m" (var) \ + : "0" ((pxo_T__)(val))); \ + break; \ + case 8: \ + asm(op "q %0,"__percpu_arg(1) \ + : "=r" (pxo_ret__), "+m" (var) \ + : "0" ((pxo_T__)(val))); \ + break; \ + default: __bad_percpu_size(); \ + } \ + pxo_ret__; \ +}) + +#define percpu_xchg(var, val) percpu_exchange_op("xchg", var, val) +#define percpu_xadd(var, val) percpu_exchange_op("xadd", var, val) + +#endif /* _ASM_X86_XEN_PERCPU_H */ diff --git a/arch/x86/include/mach-xen/asm/perf_event.h b/arch/x86/include/mach-xen/asm/perf_event.h new file mode 100644 index 0000000..6c784d1 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/perf_event.h @@ -0,0 +1,42 @@ +#ifndef _ASM_X86_PERF_EVENT_H +#define _ASM_X86_PERF_EVENT_H + +#ifdef CONFIG_PERF_EVENTS + +/* + * Abuse bit 3 of the cpu eflags register to indicate proper PEBS IP fixups. + * This flag is otherwise unused and ABI specified to be 0, so nobody should + * care what we do with it. + */ +#define PERF_EFLAGS_EXACT (1UL << 3) + +#define perf_instruction_pointer(regs) instruction_pointer(regs) + +#define perf_misc_flags(regs) ({ \ + struct pt_regs *_r_ = (regs); \ + unsigned long _f_ = user_mode(_r_) ? PERF_RECORD_MISC_USER \ + : PERF_RECORD_MISC_KERNEL; \ + _r_->flags & PERF_EFLAGS_EXACT ? _f_ | PERF_RECORD_MISC_EXACT_IP : _f_; \ +}) + +#include + +/* + * We abuse bit 3 from flags to pass exact information, see perf_misc_flags + * and the comment with PERF_EFLAGS_EXACT. 
+ */ +#define perf_arch_fetch_caller_regs(regs, __ip) { \ + (regs)->ip = (__ip); \ + (regs)->bp = caller_frame_pointer(); \ + (regs)->cs = __KERNEL_CS; \ + regs->flags = 0; \ + asm volatile( \ + _ASM_MOV "%%"_ASM_SP ", %0\n" \ + : "=m" ((regs)->sp) \ + :: "memory" \ + ); \ +} + +#endif + +#endif /* _ASM_X86_PERF_EVENT_H */ diff --git a/arch/x86/include/mach-xen/asm/pgalloc.h b/arch/x86/include/mach-xen/asm/pgalloc.h new file mode 100644 index 0000000..3879075 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/pgalloc.h @@ -0,0 +1,159 @@ +#ifndef _ASM_X86_PGALLOC_H +#define _ASM_X86_PGALLOC_H + +#include +#include /* for struct page */ +#include + +#include /* for phys_to_virt and page_to_pseudophys */ + +static inline int paravirt_pgd_alloc(struct mm_struct *mm) { return 0; } +static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) {} + +static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {} +static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {} +static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn, + unsigned long start, unsigned long count) {} +static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {} +static inline void paravirt_release_pte(unsigned long pfn) {} +static inline void paravirt_release_pmd(unsigned long pfn) {} +static inline void paravirt_release_pud(unsigned long pfn) {} + +#ifdef CONFIG_X86_64 +void early_make_page_readonly(void *va, unsigned int feature); +pmd_t *early_get_pmd(unsigned long va); +#define make_lowmem_page_readonly make_page_readonly +#define make_lowmem_page_writable make_page_writable +#endif + +/* + * Flags to use when allocating a user page table page. + */ +extern gfp_t __userpte_alloc_gfp; + +/* + * Allocate and free page tables. + */ +extern pgd_t *pgd_alloc(struct mm_struct *); +extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); + +extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); +extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long); + +/* Should really implement gc for free page table pages. This could be + done with a reference count in struct page. 
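   The helpers below pair up in the usual way; a rough sketch of
   populating a pmd slot with a fresh pte page, loosely following
   __pte_alloc() (mm, pmd and addr are assumed to come from the caller):

	pgtable_t new = pte_alloc_one(mm, addr);

	if (new) {
		spin_lock(&mm->page_table_lock);
		if (pmd_none(*pmd))
			pmd_populate(mm, pmd, new);
		else
			pte_free(mm, new);
		spin_unlock(&mm->page_table_lock);
	}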
*/ + +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) +{ + BUG_ON((unsigned long)pte & (PAGE_SIZE-1)); + make_lowmem_page_writable(pte, XENFEAT_writable_page_tables); + free_page((unsigned long)pte); +} + +extern void __pte_free(pgtable_t); +static inline void pte_free(struct mm_struct *mm, struct page *pte) +{ + __pte_free(pte); +} + +extern void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte); + +static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte, + unsigned long address) +{ + ___pte_free_tlb(tlb, pte); +} + +static inline void pmd_populate_kernel(struct mm_struct *mm, + pmd_t *pmd, pte_t *pte) +{ + paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT); + set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); +} + +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, + struct page *pte) +{ + unsigned long pfn = page_to_pfn(pte); + pmd_t ent = __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE); + + paravirt_alloc_pte(mm, pfn); + if (PagePinned(virt_to_page(pmd))) { +#ifndef CONFIG_HIGHPTE + BUG_ON(PageHighMem(pte)); +#endif + set_pmd(pmd, ent); + } else + *pmd = ent; +} + +#define pmd_pgtable(pmd) pmd_page(pmd) + +#if PAGETABLE_LEVELS > 2 +extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr); +extern void __pmd_free(pgtable_t); + +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) +{ + BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); + __pmd_free(virt_to_page(pmd)); +} + +extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); + +static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long address) +{ + ___pmd_free_tlb(tlb, pmd); +} + +#ifdef CONFIG_X86_PAE +extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd); +#else /* !CONFIG_X86_PAE */ +static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) +{ + pud_t ent = __pud(_PAGE_TABLE | __pa(pmd)); + + paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); + if (PagePinned(virt_to_page(pud))) + set_pud(pud, ent); + else + *pud = ent; +} +#endif /* CONFIG_X86_PAE */ + +#if PAGETABLE_LEVELS > 3 +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) +{ + pgd_t ent = __pgd(_PAGE_TABLE | __pa(pud)); + + paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT); + if (unlikely(PagePinned(virt_to_page(pgd)))) + xen_l4_entry_update(pgd, ent); + else + *__user_pgd(pgd) = *pgd = ent; +} + +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + return (pud_t *)pmd_alloc_one(mm, addr); +} + +static inline void pud_free(struct mm_struct *mm, pud_t *pud) +{ + BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); + __pmd_free(virt_to_page(pud)); +} + +extern void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud); + +static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, + unsigned long address) +{ + ___pud_free_tlb(tlb, pud); +} + +#endif /* PAGETABLE_LEVELS > 3 */ +#endif /* PAGETABLE_LEVELS > 2 */ + +#endif /* _ASM_X86_PGALLOC_H */ diff --git a/arch/x86/include/mach-xen/asm/pgtable-3level.h b/arch/x86/include/mach-xen/asm/pgtable-3level.h new file mode 100644 index 0000000..71e906c --- /dev/null +++ b/arch/x86/include/mach-xen/asm/pgtable-3level.h @@ -0,0 +1,152 @@ +#ifndef _ASM_X86_PGTABLE_3LEVEL_H +#define _ASM_X86_PGTABLE_3LEVEL_H + +/* + * Intel Physical Address Extension (PAE) Mode - three-level page + * tables on PPro+ CPUs. 
+ * + * Copyright (C) 1999 Ingo Molnar + */ + +#define pte_ERROR(e) \ + printk("%s:%d: bad pte %p(%016Lx pfn %08lx).\n", \ + __FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e)) +#define pmd_ERROR(e) \ + printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", \ + __FILE__, __LINE__, &(e), __pmd_val(e), \ + (pmd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT) +#define pgd_ERROR(e) \ + printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", \ + __FILE__, __LINE__, &(e), __pgd_val(e), \ + (pgd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT) + +/* Rules for using set_pte: the pte being assigned *must* be + * either not present or in a state where the hardware will + * not attempt to update the pte. In places where this is + * not possible, use pte_get_and_clear to obtain the old pte + * value and then use set_pte to update it. -ben + */ + +static inline void xen_set_pte(pte_t *ptep, pte_t pte) +{ + ptep->pte_high = pte.pte_high; + smp_wmb(); + ptep->pte_low = pte.pte_low; +} + +static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd) +{ + xen_l2_entry_update(pmdp, pmd); +} + +static inline void xen_set_pud(pud_t *pudp, pud_t pud) +{ + xen_l3_entry_update(pudp, pud); +} + +/* + * For PTEs and PDEs, we must clear the P-bit first when clearing a page table + * entry, so clear the bottom half first and enforce ordering with a compiler + * barrier. + */ +static inline void __xen_pte_clear(pte_t *ptep) +{ + ptep->pte_low = 0; + smp_wmb(); + ptep->pte_high = 0; +} + +#define xen_pmd_clear(pmd) \ +({ \ + pmd_t *__pmdp = (pmd); \ + PagePinned(virt_to_page(__pmdp)) \ + ? set_pmd(__pmdp, __pmd(0)) \ + : (void)(*__pmdp = __pmd(0)); \ +}) + +static inline void __xen_pud_clear(pud_t *pudp) +{ + set_pud(pudp, __pud(0)); + + /* + * According to Intel App note "TLBs, Paging-Structure Caches, + * and Their Invalidation", April 2007, document 317080-001, + * section 8.1: in PAE mode we explicitly have to flush the + * TLB via cr3 if the top-level pgd is changed... + * + * Currently all places where pud_clear() is called either have + * flush_tlb_mm() followed or don't need TLB flush (x86_64 code or + * pud_clear_bad()), so we don't need TLB flush here. + */ +} + +#define xen_pud_clear(pudp) \ +({ \ + pud_t *__pudp = (pudp); \ + PagePinned(virt_to_page(__pudp)) \ + ? __xen_pud_clear(__pudp) \ + : (void)(*__pudp = __pud(0)); \ +}) + +#ifdef CONFIG_SMP +static inline pte_t xen_ptep_get_and_clear(pte_t *ptep, pte_t res) +{ + uint64_t val = __pte_val(res); + if (__cmpxchg64(&ptep->pte, val, 0) != val) { + /* xchg acts as a barrier before the setting of the high bits */ + res.pte_low = xchg(&ptep->pte_low, 0); + res.pte_high = ptep->pte_high; + ptep->pte_high = 0; + } + return res; +} +#else +#define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte) +#endif + +#define __pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) | \ + ((_pte).pte_high << (32-PAGE_SHIFT))) + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#ifdef CONFIG_SMP +union split_pmd { + struct { + u32 pmd_low; + u32 pmd_high; + }; + pmd_t pmd; +}; +static inline pmd_t xen_pmdp_get_and_clear(pmd_t *pmdp) +{ + union split_pmd res, *orig = (union split_pmd *)pmdp; + + /* xchg acts as a barrier before setting of the high bits */ + res.pmd_low = xchg(&orig->pmd_low, 0); + res.pmd_high = orig->pmd_high; + orig->pmd_high = 0; + + return res.pmd; +} +#else +#define xen_pmdp_get_and_clear(xp) xen_local_pmdp_get_and_clear(xp) +#endif +#endif + +/* + * Bits 0, 6 and 7 are taken in the low part of the pte, + * put the 32 bits of offset into the high part. 
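 *
 * A worked example of the encoding described above (the offset value
 * is illustrative):
 *
 *	pte_t pte = pgoff_to_pte(0x12345678);
 *
 *	// pte.pte_low == _PAGE_FILE, pte.pte_high == 0x12345678
 *	BUG_ON(pte_to_pgoff(pte) != 0x12345678);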
+ */ +#define pte_to_pgoff(pte) ((pte).pte_high) +#define pgoff_to_pte(off) \ + ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } }) +#define PTE_FILE_MAX_BITS 32 + +/* Encode and de-code a swap entry */ +#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5) +#define __swp_type(x) (((x).val) & 0x1f) +#define __swp_offset(x) ((x).val >> 5) +#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5}) +#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high }) +#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } }) + +#endif /* _ASM_X86_PGTABLE_3LEVEL_H */ diff --git a/arch/x86/include/mach-xen/asm/pgtable-3level_types.h b/arch/x86/include/mach-xen/asm/pgtable-3level_types.h new file mode 100644 index 0000000..36d6f2b --- /dev/null +++ b/arch/x86/include/mach-xen/asm/pgtable-3level_types.h @@ -0,0 +1,44 @@ +#ifndef _ASM_X86_PGTABLE_3LEVEL_DEFS_H +#define _ASM_X86_PGTABLE_3LEVEL_DEFS_H + +#ifndef __ASSEMBLY__ +#include + +typedef u64 pteval_t; +typedef u64 pmdval_t; +typedef u64 pudval_t; +typedef u64 pgdval_t; +typedef u64 pgprotval_t; + +typedef union { + struct { + unsigned long pte_low, pte_high; + }; + pteval_t pte; +} pte_t; +#endif /* !__ASSEMBLY__ */ + +#define SHARED_KERNEL_PMD 0 + +#define PAGETABLE_LEVELS 3 + +/* + * PGDIR_SHIFT determines what a top-level page table entry can map + */ +#define PGDIR_SHIFT 30 +#define PTRS_PER_PGD 4 + +/* + * PMD_SHIFT determines the size of the area a middle-level + * page table can map + */ +#define PMD_SHIFT 21 +#define PTRS_PER_PMD 512 + +/* + * entries per page directory level + */ +#define PTRS_PER_PTE 512 + + +#endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */ diff --git a/arch/x86/include/mach-xen/asm/pgtable.h b/arch/x86/include/mach-xen/asm/pgtable.h new file mode 100644 index 0000000..cd43083 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/pgtable.h @@ -0,0 +1,885 @@ +#ifndef _ASM_X86_PGTABLE_H +#define _ASM_X86_PGTABLE_H + +#include +#include + +#include + +/* + * Macro to mark a page protection value as UC- + */ +#define pgprot_noncached(prot) \ + ((boot_cpu_data.x86 > 3) \ + ? (__pgprot(pgprot_val(prot) | _PAGE_CACHE_UC_MINUS)) \ + : (prot)) + +#ifndef __ASSEMBLY__ + +#include + +/* + * ZERO_PAGE is a global shared page that is always zero: used + * for zero-mapped memory areas etc.. 
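A quick worked example of the PAE geometry defined above (PGDIR_SHIFT 30 with only 4 top-level entries, PMD_SHIFT 21, 512 entries per pmd and pte page): a 32-bit virtual address splits into 2 + 9 + 9 + 12 bits. The helper below is purely illustrative and not part of the patch.

/*
 * For vaddr = 0xc0123456:
 *   pgd index = 0xc0123456 >> 30         = 3
 *   pmd index = (0xc0123456 >> 21) & 511 = 0
 *   pte index = (0xc0123456 >> 12) & 511 = 0x123
 *   offset into the page                 = 0x456
 */
static inline void example_pae_split(unsigned long vaddr, unsigned int *pgd_i,
                                     unsigned int *pmd_i, unsigned int *pte_i)
{
        *pgd_i = (vaddr >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1);
        *pmd_i = (vaddr >> PMD_SHIFT) & (PTRS_PER_PMD - 1);
        *pte_i = (vaddr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
}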
+ */ +extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) + +extern spinlock_t pgd_lock; +extern struct list_head pgd_list; + +extern struct mm_struct *pgd_page_get_mm(struct page *page); + +#define set_pte(ptep, pte) xen_set_pte(ptep, pte) +#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte) +#define set_pmd_at(mm, addr, pmdp, pmd) xen_set_pmd_at(mm, addr, pmdp, pmd) + +#define set_pmd(pmdp, pmd) xen_set_pmd(pmdp, pmd) + +#ifndef __PAGETABLE_PUD_FOLDED +#define set_pgd(pgdp, pgd) xen_set_pgd(pgdp, pgd) +#define pgd_clear(pgd) xen_pgd_clear(pgd) +#endif + +#ifndef set_pud +# define set_pud(pudp, pud) xen_set_pud(pudp, pud) +#endif + +#ifndef __PAGETABLE_PMD_FOLDED +#define pud_clear(pud) xen_pud_clear(pud) +#endif + +#define pte_clear(mm, addr, ptep) xen_pte_clear(mm, addr, ptep) +#define pmd_clear(pmd) xen_pmd_clear(pmd) + +#define pte_update(mm, addr, ptep) do { } while (0) +#define pte_update_defer(mm, addr, ptep) do { } while (0) +#define pmd_update(mm, addr, ptep) do { } while (0) +#define pmd_update_defer(mm, addr, ptep) do { } while (0) + +#define pgd_val(x) xen_pgd_val(x) +#define __pgd(x) xen_make_pgd(x) + +#ifndef __PAGETABLE_PUD_FOLDED +#define pud_val(x) xen_pud_val(x) +#define __pud(x) xen_make_pud(x) +#endif + +#ifndef __PAGETABLE_PMD_FOLDED +#define pmd_val(x) xen_pmd_val(x) +#define __pmd(x) xen_make_pmd(x) +#endif + +#define pte_val(x) xen_pte_val(x) +#define __pte(x) xen_make_pte(x) + +#define arch_end_context_switch(prev) do {} while(0) + +/* + * The following only work if pte_present() is true. + * Undefined behaviour if not.. + */ +static inline int pte_dirty(pte_t pte) +{ + return pte_flags(pte) & _PAGE_DIRTY; +} + +static inline int pte_young(pte_t pte) +{ + return pte_flags(pte) & _PAGE_ACCESSED; +} + +static inline int pmd_young(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_ACCESSED; +} + +static inline int pte_write(pte_t pte) +{ + return pte_flags(pte) & _PAGE_RW; +} + +static inline int pte_file(pte_t pte) +{ + return pte_flags(pte) & _PAGE_FILE; +} + +static inline int pte_huge(pte_t pte) +{ + return pte_flags(pte) & _PAGE_PSE; +} + +static inline int pte_global(pte_t pte) +{ + return 0; +} + +static inline int pte_exec(pte_t pte) +{ + return !(pte_flags(pte) & _PAGE_NX); +} + +static inline int pte_special(pte_t pte) +{ + return pte_flags(pte) & _PAGE_SPECIAL; +} + +#define pte_mfn(_pte) ((_pte).pte_low & _PAGE_PRESENT ? \ + __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte))) +#define pte_pfn(_pte) ((_pte).pte_low & _PAGE_IOMAP ? max_mapnr : \ + (_pte).pte_low & _PAGE_PRESENT ? 
\ + mfn_to_local_pfn(__pte_mfn(_pte)) : \ + __pte_mfn(_pte)) + +#define pte_page(pte) pfn_to_page(pte_pfn(pte)) + +static inline unsigned long pmd_pfn(pmd_t pmd) +{ + return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT; +} + +static inline int pmd_large(pmd_t pte) +{ + return (pmd_flags(pte) & (_PAGE_PSE | _PAGE_PRESENT)) == + (_PAGE_PSE | _PAGE_PRESENT); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int pmd_trans_splitting(pmd_t pmd) +{ + return pmd_val(pmd) & _PAGE_SPLITTING; +} + +static inline int pmd_trans_huge(pmd_t pmd) +{ + return pmd_val(pmd) & _PAGE_PSE; +} + +static inline int has_transparent_hugepage(void) +{ + return cpu_has_pse; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +static inline pte_t pte_set_flags(pte_t pte, pteval_t set) +{ + pteval_t v = __pte_val(pte); + + return __pte_ma(v | set); +} + +static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear) +{ + pteval_t v = __pte_val(pte); + + return __pte_ma(v & ~clear); +} + +static inline pte_t pte_mkclean(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_DIRTY); +} + +static inline pte_t pte_mkold(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_ACCESSED); +} + +static inline pte_t pte_wrprotect(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_RW); +} + +static inline pte_t pte_mkexec(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_NX); +} + +static inline pte_t pte_mkdirty(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_DIRTY); +} + +static inline pte_t pte_mkyoung(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_ACCESSED); +} + +static inline pte_t pte_mkwrite(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_RW); +} + +static inline pte_t pte_mkhuge(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_PSE); +} + +static inline pte_t pte_clrhuge(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_PSE); +} + +static inline pte_t pte_mkglobal(pte_t pte) +{ + return pte; +} + +static inline pte_t pte_clrglobal(pte_t pte) +{ + return pte; +} + +static inline pte_t pte_mkspecial(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_SPECIAL); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) +{ + pmdval_t v = native_pmd_val(pmd); + + return __pmd(v | set); +} + +static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) +{ + pmdval_t v = native_pmd_val(pmd); + + return __pmd(v & ~clear); +} + +static inline pmd_t pmd_mkold(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_ACCESSED); +} + +static inline pmd_t pmd_wrprotect(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_RW); +} + +static inline pmd_t pmd_mkdirty(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_DIRTY); +} + +static inline pmd_t pmd_mkhuge(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_PSE); +} + +static inline pmd_t pmd_mkyoung(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_ACCESSED); +} + +static inline pmd_t pmd_mkwrite(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_RW); +} + +static inline pmd_t pmd_mknotpresent(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_PRESENT); +} +#endif + +/* + * Mask out unsupported bits in a present pgprot. Non-present pgprots + * can use those bits for other purposes, so leave them be. 
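Side note, not part of the patch: the pte_mk*()/pte_clr*() helpers above are all thin wrappers around pte_set_flags()/pte_clear_flags(), so they compose freely without disturbing the machine frame number. A hypothetical example:

static inline pte_t example_make_clean_readonly(pte_t pte)
{
        /* drop write permission and the dirty bit, keep everything else */
        return pte_wrprotect(pte_mkclean(pte));
}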
+ */ +static inline pgprotval_t massage_pgprot(pgprot_t pgprot) +{ + pgprotval_t protval = pgprot_val(pgprot); + + if (protval & _PAGE_PRESENT) + protval &= __supported_pte_mask; + + return protval; +} + +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) +{ + return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) | + massage_pgprot(pgprot)); +} + +static inline pte_t pfn_pte_ma(phys_addr_t page_nr, pgprot_t pgprot) +{ + return __pte_ma((page_nr << PAGE_SHIFT) | massage_pgprot(pgprot)); +} + +static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) +{ + return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) | + massage_pgprot(pgprot)); +} + +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) +{ + pteval_t val = pte_val(pte) & _PAGE_CHG_MASK; + + val |= massage_pgprot(newprot) & ~_PAGE_CHG_MASK; + + return __pte(val); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) +{ + pmdval_t val = pmd_val(pmd); + + val &= _HPAGE_CHG_MASK; + val |= massage_pgprot(newprot) & ~_HPAGE_CHG_MASK; + + return __pmd(val); +} +#endif + +/* mprotect needs to preserve PAT bits when updating vm_page_prot */ +#define pgprot_modify pgprot_modify +static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) +{ + pgprotval_t preservebits = pgprot_val(oldprot) & _PAGE_CHG_MASK; + pgprotval_t addbits = pgprot_val(newprot); + return __pgprot(preservebits | addbits); +} + +#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK) + +#define canon_pgprot(p) __pgprot(massage_pgprot(p)) + +static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, + unsigned long flags, + unsigned long new_flags) +{ + /* + * PAT type is always WB for untracked ranges, so no need to check. + */ + if (x86_platform.is_untracked_pat_range(paddr, paddr + size)) + return 1; + + /* + * Certain new memtypes are not allowed with certain + * requested memtype: + * - request is uncached, return cannot be write-back + * - request is write-combine, return cannot be write-back + */ + if ((flags == _PAGE_CACHE_UC_MINUS && + new_flags == _PAGE_CACHE_WB) || + (flags == _PAGE_CACHE_WC && + new_flags == _PAGE_CACHE_WB)) { + return 0; + } + + return 1; +} + +pmd_t *populate_extra_pmd(unsigned long vaddr); +pte_t *populate_extra_pte(unsigned long vaddr); +#endif /* __ASSEMBLY__ */ + +#ifdef CONFIG_X86_32 +# include "pgtable_32.h" +#else +# include "pgtable_64.h" +#endif + +#ifndef __ASSEMBLY__ +#include + +static inline int pte_none(pte_t pte) +{ + return !pte.pte; +} + +#define __HAVE_ARCH_PTE_SAME +static inline int pte_same(pte_t a, pte_t b) +{ + return a.pte == b.pte; +} + +static inline int pte_present(pte_t a) +{ + return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); +} + +static inline int pte_hidden(pte_t pte) +{ + return pte_flags(pte) & _PAGE_HIDDEN; +} + +static inline int pmd_present(pmd_t pmd) +{ +#if CONFIG_XEN_COMPAT <= 0x030002 +/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t. + can temporarily clear it. */ + return __pmd_val(pmd) != 0; +#else + return pmd_flags(pmd) & _PAGE_PRESENT; +#endif +} + +static inline int pmd_none(pmd_t pmd) +{ + /* Only check low word on 32-bit platforms, since it might be + out of sync with upper half. 
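Illustrative usage, not part of the patch: pfn_pte() and pte_modify() above are the usual ways to build or re-protect an entry, with massage_pgprot() silently dropping bits the CPU does not support (for example _PAGE_NX when it is not in __supported_pte_mask). PAGE_KERNEL and PAGE_KERNEL_RO come from pgtable_types.h later in this patch; the helper names are hypothetical.

static inline pte_t example_kernel_pte(unsigned long pfn)
{
        return pfn_pte(pfn, PAGE_KERNEL);       /* pfn plus filtered protections */
}

static inline pte_t example_make_readonly(pte_t pte)
{
        /* keeps the pfn and the _PAGE_CHG_MASK bits, swaps the rest */
        return pte_modify(pte, PAGE_KERNEL_RO);
}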
*/ + return (unsigned long)__pmd_val(pmd) == 0; +} + +static inline unsigned long pmd_page_vaddr(pmd_t pmd) +{ + return (unsigned long)__va(pmd_val(pmd) & PTE_PFN_MASK); +} + +/* + * Currently stuck as a macro due to indirect forward reference to + * linux/mmzone.h's __section_mem_map_addr() definition: + */ +#define pmd_page(pmd) pfn_to_page((pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT) + +/* + * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] + * + * this macro returns the index of the entry in the pmd page which would + * control the given virtual address + */ +static inline unsigned long pmd_index(unsigned long address) +{ + return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); +} + +/* + * Conversion functions: convert a page and protection to a page entry, + * and a page entry and page directory to the page they refer to. + * + * (Currently stuck as a macro because of indirect forward reference + * to linux/mm.h:page_to_nid()) + */ +#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) + +/* + * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE] + * + * this function returns the index of the entry in the pte page which would + * control the given virtual address + */ +static inline unsigned long pte_index(unsigned long address) +{ + return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); +} + +static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) +{ + return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address); +} + +static inline int pmd_bad(pmd_t pmd) +{ +#if CONFIG_XEN_COMPAT <= 0x030002 + return (pmd_flags(pmd) & ~_PAGE_USER & ~_PAGE_PRESENT) + != (_KERNPG_TABLE & ~_PAGE_PRESENT); +#else + return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; +#endif +} + +static inline unsigned long pages_to_mb(unsigned long npg) +{ + return npg >> (20 - PAGE_SHIFT); +} + +#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ + direct_remap_pfn_range(vma, vaddr, pfn, size, prot, DOMID_IO) + +#if PAGETABLE_LEVELS > 2 +static inline int pud_none(pud_t pud) +{ + return __pud_val(pud) == 0; +} + +static inline int pud_present(pud_t pud) +{ + return pud_flags(pud) & _PAGE_PRESENT; +} + +static inline unsigned long pud_page_vaddr(pud_t pud) +{ + return (unsigned long)__va((unsigned long)pud_val(pud) & PTE_PFN_MASK); +} + +/* + * Currently stuck as a macro due to indirect forward reference to + * linux/mmzone.h's __section_mem_map_addr() definition: + */ +#define pud_page(pud) pfn_to_page(pud_val(pud) >> PAGE_SHIFT) + +/* Find an entry in the second-level page table.. 
*/ +static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) +{ + return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address); +} + +static inline int pud_large(pud_t pud) +{ + return (__pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) == + (_PAGE_PSE | _PAGE_PRESENT); +} + +static inline int pud_bad(pud_t pud) +{ + return (pud_flags(pud) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0; +} +#else +static inline int pud_large(pud_t pud) +{ + return 0; +} +#endif /* PAGETABLE_LEVELS > 2 */ + +#if PAGETABLE_LEVELS > 3 +static inline int pgd_present(pgd_t pgd) +{ + return pgd_flags(pgd) & _PAGE_PRESENT; +} + +static inline unsigned long pgd_page_vaddr(pgd_t pgd) +{ + return (unsigned long)__va((unsigned long)pgd_val(pgd) & PTE_PFN_MASK); +} + +/* + * Currently stuck as a macro due to indirect forward reference to + * linux/mmzone.h's __section_mem_map_addr() definition: + */ +#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) + +/* to find an entry in a page-table-directory. */ +static inline unsigned long pud_index(unsigned long address) +{ + return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); +} + +static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address) +{ + return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(address); +} + +static inline int pgd_bad(pgd_t pgd) +{ + return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; +} + +static inline int pgd_none(pgd_t pgd) +{ + return !__pgd_val(pgd); +} +#endif /* PAGETABLE_LEVELS > 3 */ + +#endif /* __ASSEMBLY__ */ + +/* + * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD] + * + * this macro returns the index of the entry in the pgd page which would + * control the given virtual address + */ +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)) + +/* + * pgd_offset() returns a (pgd_t *) + * pgd_index() is used get the offset into the pgd page's array of pgd_t's; + */ +#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address))) +/* + * a shortcut which implies the use of the kernel's pgd, instead + * of a process's + */ +#define pgd_offset_k(address) pgd_offset(&init_mm, (address)) + + +#define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET) +#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY) + +#ifndef __ASSEMBLY__ + +#define direct_gbpages 0 + +/* local pte updates need not use xchg for locking */ +static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res) +{ + xen_set_pte(ptep, __pte(0)); + return res; +} + +static inline pmd_t xen_local_pmdp_get_and_clear(pmd_t *pmdp) +{ + pmd_t res = *pmdp; + + xen_set_pmd(pmdp, __pmd(0)); + return res; +} + +static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep , pte_t pte) +{ + if ((mm != current->mm && mm != &init_mm) || + HYPERVISOR_update_va_mapping(addr, pte, 0)) + xen_set_pte(ptep, pte); +} + +static inline void xen_set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp , pmd_t pmd) +{ + xen_set_pmd(pmdp, pmd); +} + +static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + if ((mm != current->mm && mm != &init_mm) + || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) + __xen_pte_clear(ptep); +} + +#ifndef CONFIG_PARAVIRT +/* + * Rules for using pte_update - it must be called after any PTE update which + * has not been done using the set_pte / clear_pte interfaces. It is used by + * shadow mode hypervisors to resynchronize the shadow page tables. 
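Taken together, the *_index()/*_offset() helpers above give the usual software page-table walk. The sketch below is not part of the patch, the name is hypothetical, and huge mappings simply bail out; it shows how the helpers chain for a kernel virtual address.

static pte_t *example_walk_kernel_address(unsigned long address)
{
        pgd_t *pgd = pgd_offset_k(address);
        pud_t *pud;
        pmd_t *pmd;

        if (pgd_none(*pgd))
                return NULL;
        pud = pud_offset(pgd, address);
        if (pud_none(*pud) || pud_large(*pud))
                return NULL;
        pmd = pmd_offset(pud, address);
        if (pmd_none(*pmd) || pmd_large(*pmd))
                return NULL;
        return pte_offset_kernel(pmd, address);
}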
Kernel PTE + * updates should either be sets, clears, or set_pte_atomic for P->P + * transitions, which means this hook should only be called for user PTEs. + * This hook implies a P->P protection or access change has taken place, which + * requires a subsequent TLB flush. The notification can optionally be delayed + * until the TLB flush event by using the pte_update_defer form of the + * interface, but care must be taken to assure that the flush happens while + * still holding the same page table lock so that the shadow and primary pages + * do not become out of sync on SMP. + */ +#define pte_update(mm, addr, ptep) do { } while (0) +#define pte_update_defer(mm, addr, ptep) do { } while (0) +#endif + +/* + * We only update the dirty/accessed state if we set + * the dirty bit by hand in the kernel, since the hardware + * will do the accessed bit for us, and we don't want to + * race with other CPU's that might be updating the dirty + * bit at the same time. + */ +struct vm_area_struct; + +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +extern int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty); + +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG +extern int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); + +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH +extern int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep); + +#define __HAVE_ARCH_PTEP_CLEAR_FLUSH +#define ptep_clear_flush(vma, addr, ptep) \ +({ \ + pte_t *__ptep = (ptep); \ + pte_t __res = *__ptep; \ + if (!pte_none(__res) && \ + ((vma)->vm_mm != current->mm || \ + HYPERVISOR_update_va_mapping(addr, __pte(0), \ + uvm_multi(mm_cpumask((vma)->vm_mm)) | \ + UVMF_INVLPG))) { \ + __xen_pte_clear(__ptep); \ + flush_tlb_page(vma, addr); \ + } \ + __res; \ +}) + +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + pte_t pte = *ptep; + if (!pte_none(pte) + && (mm != &init_mm + || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) { + pte = xen_ptep_get_and_clear(ptep, pte); + pte_update(mm, addr, ptep); + } + return pte; +} + +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL +#define ptep_get_and_clear_full(mm, addr, ptep, full) \ + ((full) ? 
({ \ + pte_t *__ptep = (ptep); \ + pte_t __res = *__ptep; \ + if (!PagePinned(virt_to_page((mm)->pgd))) \ + __xen_pte_clear(__ptep); \ + else if (!pte_none(__res)) \ + xen_l1_entry_update(__ptep, __pte(0)); \ + __res; \ + }) : \ + ptep_get_and_clear(mm, addr, ptep)) + +pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *, unsigned long, pte_t *, int); + +#define __HAVE_ARCH_PTEP_SET_WRPROTECT +static inline void ptep_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + pte_t pte = *ptep; + if (pte_write(pte)) + set_pte_at(mm, addr, ptep, pte_wrprotect(pte)); +} + +#define flush_tlb_fix_spurious_fault(vma, address) do { } while (0) + +#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) + +#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS +extern int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty); + +#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG +extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp); + +#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH +extern int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); + + +#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH +extern void pmdp_splitting_flush(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp); + +#define __HAVE_ARCH_PMD_WRITE +static inline int pmd_write(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_RW; +} + +#define __HAVE_ARCH_PMDP_GET_AND_CLEAR +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp) +{ + pmd_t pmd = xen_pmdp_get_and_clear(pmdp); + pmd_update(mm, addr, pmdp); + return pmd; +} +#endif + +#define __HAVE_ARCH_PMDP_SET_WRPROTECT +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline void pmdp_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp); + pmd_update(mm, addr, pmdp); +} +#endif + +/* + * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); + * + * dst - pointer to pgd range anwhere on a pgd page + * src - "" + * count - the number of pgds to copy. + * + * dst and src can be on the same page, but the range must not overlap, + * and must not cross a page boundary. 
+ */ +static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) +{ + memcpy(dst, src, count * sizeof(pgd_t)); +} + +#define arbitrary_virt_to_mfn(va) \ +({ \ + unsigned int __lvl; \ + pte_t *__ptep = lookup_address((unsigned long)(va), &__lvl); \ + BUG_ON(!__ptep || __lvl != PG_LEVEL_4K || !pte_present(*__ptep));\ + pte_mfn(*__ptep); \ +}) + +#define arbitrary_virt_to_machine(va) \ + (((maddr_t)arbitrary_virt_to_mfn(va) << PAGE_SHIFT) \ + | ((unsigned long)(va) & (PAGE_SIZE - 1))) + +#ifdef CONFIG_HIGHPTE +#include +struct page *kmap_atomic_to_page(void *); +#define ptep_to_machine(ptep) \ +({ \ + pte_t *__ptep = (ptep); \ + page_to_phys(kmap_atomic_to_page(__ptep)) \ + | ((unsigned long)__ptep & (PAGE_SIZE - 1)); \ +}) +#else +#define ptep_to_machine(ptep) virt_to_machine(ptep) +#endif + +#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION +static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ +#if CONFIG_XEN_COMPAT < 0x030300 + if (unlikely(!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))) + return ptep_get_and_clear(mm, addr, ptep); +#endif + return *ptep; +} + +static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + mmu_update_t u; + +#if CONFIG_XEN_COMPAT < 0x030300 + if (unlikely(!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))) { + set_pte_at(mm, addr, ptep, pte); + return; + } +#endif + u.ptr = ptep_to_machine(ptep) | MMU_PT_UPDATE_PRESERVE_AD; + u.val = __pte_val(pte); + if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF)) + BUG(); +} + +#include + +#include +void make_page_readonly(void *va, unsigned int feature); +void make_page_writable(void *va, unsigned int feature); +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature); +void make_pages_writable(void *va, unsigned int nr, unsigned int feature); + +struct vm_area_struct; + +int direct_remap_pfn_range(struct vm_area_struct *vma, + unsigned long address, + phys_addr_t mfn, + unsigned long size, + pgprot_t prot, + domid_t domid); +int direct_kernel_remap_pfn_range(unsigned long address, + unsigned long mfn, + unsigned long size, + pgprot_t prot, + domid_t domid); +int create_lookup_pte_addr(struct mm_struct *mm, + unsigned long address, + uint64_t *ptep); + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_PGTABLE_H */ diff --git a/arch/x86/include/mach-xen/asm/pgtable_32.h b/arch/x86/include/mach-xen/asm/pgtable_32.h new file mode 100644 index 0000000..7d89873 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/pgtable_32.h @@ -0,0 +1,89 @@ +#ifndef _ASM_X86_PGTABLE_32_H +#define _ASM_X86_PGTABLE_32_H + +#include + +/* + * The Linux memory management assumes a three-level page table setup. On + * the i386, we use that, but "fold" the mid level into the top-level page + * table, so that we physically have the same two-level page table as the + * i386 mmu expects. + * + * This file contains the functions and defines necessary to modify and use + * the i386 page table tree. 
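ptep_modify_prot_start()/_commit() above form a small transaction around a protection change: start reads the old pte (falling back to get-and-clear only for pre-3.3 hypervisors that cannot preserve the accessed/dirty bits), the caller derives the new pte, and commit pushes it with a single MMU_PT_UPDATE_PRESERVE_AD mmu_update. A sketch of the calling pattern, not part of the patch; the caller is assumed to hold the pte lock.

static inline void example_change_protection(struct mm_struct *mm,
                                             unsigned long addr,
                                             pte_t *ptep, pgprot_t newprot)
{
        pte_t pte = ptep_modify_prot_start(mm, addr, ptep);

        pte = pte_modify(pte, newprot);
        ptep_modify_prot_commit(mm, addr, ptep, pte);
}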
+ */ +#ifndef __ASSEMBLY__ +#include +#include +#include + +#include +#include +#include +#include + +struct vm_area_struct; + +extern pgd_t *swapper_pg_dir; +extern pgd_t initial_page_table[1024]; + +static inline void pgtable_cache_init(void) { } +static inline void check_pgt_cache(void) { } +void paging_init(void); + +extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t); + + +/* + * Define this if things work differently on an i386 and an i486: + * it will (on an i486) warn about kernel memory accesses that are + * done without a 'access_ok(VERIFY_WRITE,..)' + */ +#undef TEST_ACCESS_OK + +#ifdef CONFIG_X86_PAE +# include +#else +# include +#endif + +#if defined(CONFIG_HIGHPTE) +#define pte_offset_map(dir, address) \ + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir))) + \ + pte_index((address))) +#define pte_unmap(pte) kunmap_atomic((pte)) +#else +#define pte_offset_map(dir, address) \ + ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address))) +#define pte_unmap(pte) do { } while (0) +#endif + +/* Clear a kernel PTE and flush it from the TLB */ +#define kpte_clear_flush(ptep, vaddr) \ +do { \ + if (HYPERVISOR_update_va_mapping(vaddr, __pte(0), UVMF_INVLPG)) \ + BUG(); \ +} while (0) + +/* + * The i386 doesn't have any external MMU info: the kernel page + * tables contain all the necessary information. + */ +#define update_mmu_cache(vma, address, ptep) do { } while (0) + +void make_lowmem_page_readonly(void *va, unsigned int feature); +void make_lowmem_page_writable(void *va, unsigned int feature); + +#endif /* !__ASSEMBLY__ */ + +/* + * kern_addr_valid() is (1) for FLATMEM and (0) for + * SPARSEMEM and DISCONTIGMEM + */ +#ifdef CONFIG_FLATMEM +#define kern_addr_valid(addr) (1) +#else +#define kern_addr_valid(kaddr) (0) +#endif + +#endif /* _ASM_X86_PGTABLE_32_H */ diff --git a/arch/x86/include/mach-xen/asm/pgtable_64.h b/arch/x86/include/mach-xen/asm/pgtable_64.h new file mode 100644 index 0000000..f58b2ef --- /dev/null +++ b/arch/x86/include/mach-xen/asm/pgtable_64.h @@ -0,0 +1,203 @@ +#ifndef _ASM_X86_PGTABLE_64_H +#define _ASM_X86_PGTABLE_64_H + +#include +#include + +#ifndef __ASSEMBLY__ + +/* + * This file contains the functions and defines necessary to modify and use + * the x86-64 page table tree. 
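With CONFIG_HIGHPTE the 32-bit pte pages themselves can live in highmem, so the pte_offset_map() above has to set up a temporary atomic kmap, and every call must be paired with pte_unmap(). Usage sketch, not part of the patch; the caller is assumed to hold the relevant pte lock if the entry may change.

static inline pte_t example_read_pte(pmd_t *pmd, unsigned long addr)
{
        pte_t *ptep = pte_offset_map(pmd, addr);
        pte_t pte = *ptep;

        pte_unmap(ptep);        /* a no-op unless the pte page was kmapped */
        return pte;
}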
+ */ +#include +#include +#include +#include + +#ifdef CONFIG_XEN +extern pud_t level3_user_pgt[512]; + +extern void xen_init_pt(void); +extern void xen_switch_pt(void); +#endif + +extern pud_t level3_kernel_pgt[512]; +extern pud_t level3_ident_pgt[512]; +extern pmd_t level2_kernel_pgt[512]; +extern pmd_t level2_fixmap_pgt[512]; +extern pmd_t level2_ident_pgt[512]; +extern pgd_t init_level4_pgt[]; + +#define swapper_pg_dir init_level4_pgt + +extern void paging_init(void); + +#define pte_ERROR(e) \ + printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", \ + __FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e)) +#define pmd_ERROR(e) \ + printk("%s:%d: bad pmd %p(%016lx pfn %010lx).\n", \ + __FILE__, __LINE__, &(e), __pmd_val(e), pmd_pfn(e)) +#define pud_ERROR(e) \ + printk("%s:%d: bad pud %p(%016lx pfn %010Lx).\n", \ + __FILE__, __LINE__, &(e), __pud_val(e), \ + (pud_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT) +#define pgd_ERROR(e) \ + printk("%s:%d: bad pgd %p(%016lx pfn %010Lx).\n", \ + __FILE__, __LINE__, &(e), __pgd_val(e), \ + (pgd_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT) + +struct mm_struct; + +void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte); + + +#define __xen_pte_clear(ptep) xen_set_pte(ptep, __pte(0)) + +static inline void xen_set_pte(pte_t *ptep, pte_t pte) +{ + *ptep = pte; +} + +static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd) +{ + xen_l2_entry_update(pmdp, pmd); +} + +#define xen_pmd_clear(pmd) \ +({ \ + pmd_t *__pmdp = (pmd); \ + PagePinned(virt_to_page(__pmdp)) \ + ? set_pmd(__pmdp, xen_make_pmd(0)) \ + : (void)(*__pmdp = xen_make_pmd(0)); \ +}) + +#ifdef CONFIG_SMP +static inline pte_t xen_ptep_get_and_clear(pte_t *xp, pte_t ret) +{ + return __pte_ma(xchg(&xp->pte, 0)); +} +#else +#define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte) +#endif + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#ifdef CONFIG_SMP +static inline pmd_t xen_pmdp_get_and_clear(pmd_t *xp) +{ + return xen_make_pmd(xchg(&xp->pmd, 0)); +} +#else +#define xen_pmdp_get_and_clear(xp) xen_local_pmdp_get_and_clear(xp) +#endif +#endif + +static inline void xen_set_pud(pud_t *pudp, pud_t pud) +{ + xen_l3_entry_update(pudp, pud); +} + +#define xen_pud_clear(pud) \ +({ \ + pud_t *__pudp = (pud); \ + PagePinned(virt_to_page(__pudp)) \ + ? set_pud(__pudp, xen_make_pud(0)) \ + : (void)(*__pudp = xen_make_pud(0)); \ +}) + +static inline pgd_t *__user_pgd(pgd_t *pgd) +{ + if (unlikely(((unsigned long)pgd & PAGE_MASK) + == (unsigned long)init_level4_pgt)) + return NULL; + return (pgd_t *)(virt_to_page(pgd)->private + + ((unsigned long)pgd & ~PAGE_MASK)); +} + +static inline void xen_set_pgd(pgd_t *pgdp, pgd_t pgd) +{ + xen_l4_entry_update(pgdp, pgd); +} + +#define xen_pgd_clear(pgd) \ +({ \ + pgd_t *__pgdp = (pgd); \ + PagePinned(virt_to_page(__pgdp)) \ + ? xen_l4_entry_update(__pgdp, xen_make_pgd(0)) \ + : (void)(*__user_pgd(__pgdp) = *__pgdp = xen_make_pgd(0)); \ +}) + +#define __pte_mfn(_pte) (((_pte).pte & PTE_PFN_MASK) >> PAGE_SHIFT) + +extern unsigned long early_arbitrary_virt_to_mfn(void *va); + +extern void sync_global_pgds(unsigned long start, unsigned long end); + +/* + * Conversion functions: convert a page and protection to a page entry, + * and a page entry and page directory to the page they refer to. + */ + +/* + * Level 4 access. 
+ */ +static inline int pgd_large(pgd_t pgd) { return 0; } +#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE) + +/* PUD - Level3 access */ + +/* PMD - Level 2 access */ +#define pte_to_pgoff(pte) ((__pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT) +#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | \ + _PAGE_FILE }) +#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT + +/* PTE - Level 1 access. */ + +/* x86-64 always has all page tables mapped. */ +#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) +#define pte_unmap(pte) ((void)(pte))/* NOP */ + +#define update_mmu_cache(vma, address, ptep) do { } while (0) + +/* Encode and de-code a swap entry */ +#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE +#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) +#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) +#else +#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1) +#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1) +#endif + +#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) + +#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \ + & ((1U << SWP_TYPE_BITS) - 1)) +#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT) +#define __swp_entry(type, offset) ((swp_entry_t) { \ + ((type) << (_PAGE_BIT_PRESENT + 1)) \ + | ((offset) << SWP_OFFSET_SHIFT) }) +#define __pte_to_swp_entry(pte) ((swp_entry_t) { __pte_val(pte) }) +#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) + +extern int kern_addr_valid(unsigned long addr); + +#define HAVE_ARCH_UNMAPPED_AREA +#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN + +#define pgtable_cache_init() do { } while (0) +#define check_pgt_cache() do { } while (0) + +#define PAGE_AGP PAGE_KERNEL_NOCACHE +#define HAVE_PAGE_AGP 1 + +/* fs/proc/kcore.c */ +#define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK) +#define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK) + +#define __HAVE_ARCH_PTE_SAME + +#endif /* !__ASSEMBLY__ */ + +#endif /* _ASM_X86_PGTABLE_64_H */ diff --git a/arch/x86/include/mach-xen/asm/pgtable_64_types.h b/arch/x86/include/mach-xen/asm/pgtable_64_types.h new file mode 100644 index 0000000..c4c4665 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/pgtable_64_types.h @@ -0,0 +1,64 @@ +#ifndef _ASM_X86_PGTABLE_64_DEFS_H +#define _ASM_X86_PGTABLE_64_DEFS_H + +#ifndef __ASSEMBLY__ +#include + +/* + * These are used to make use of C type-checking.. + */ +typedef unsigned long pteval_t; +typedef unsigned long pmdval_t; +typedef unsigned long pudval_t; +typedef unsigned long pgdval_t; +typedef unsigned long pgprotval_t; + +typedef union { pteval_t pte; unsigned int pte_low; } pte_t; + +#endif /* !__ASSEMBLY__ */ + +#define SHARED_KERNEL_PMD 0 +#define PAGETABLE_LEVELS 4 + +/* + * PGDIR_SHIFT determines what a top-level page table entry can map + */ +#define PGDIR_SHIFT 39 +#define PTRS_PER_PGD 512 + +/* + * 3rd level page + */ +#define PUD_SHIFT 30 +#define PTRS_PER_PUD 512 + +/* + * PMD_SHIFT determines the size of the area a middle-level + * page table can map + */ +#define PMD_SHIFT 21 +#define PTRS_PER_PMD 512 + +/* + * entries per page directory level + */ +#define PTRS_PER_PTE 512 + +#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT) +#define PMD_MASK (~(PMD_SIZE - 1)) +#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT) +#define PUD_MASK (~(PUD_SIZE - 1)) +#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) +#define PGDIR_MASK (~(PGDIR_SIZE - 1)) + +/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. 
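The swap-entry encoding above stores the type just above the present bit and the offset above the file/protnone bit, so both survive a round trip through a non-present pte. A hypothetical self-check, not part of the patch; it holds whenever the type passes MAX_SWAPFILES_CHECK() and the offset fits below the pte width.

static inline int example_swap_roundtrip_ok(unsigned long type,
                                            unsigned long offset)
{
        swp_entry_t e = __swp_entry(type, offset);

        return __swp_type(e) == type && __swp_offset(e) == offset;
}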
*/ +#define MAX_PHYSMEM_BITS 43 +#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) +#define VMALLOC_START _AC(0xffffc90000000000, UL) +#define VMALLOC_END _AC(0xffffe8ffffffffff, UL) +#define VMEMMAP_START _AC(0xffffea0000000000, UL) +#define MODULES_VADDR _AC(0xffffffffa0000000, UL) +#define MODULES_END _AC(0xffffffffff000000, UL) +#define MODULES_LEN (MODULES_END - MODULES_VADDR) + +#endif /* _ASM_X86_PGTABLE_64_DEFS_H */ diff --git a/arch/x86/include/mach-xen/asm/pgtable_types.h b/arch/x86/include/mach-xen/asm/pgtable_types.h new file mode 100644 index 0000000..d0ca475 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/pgtable_types.h @@ -0,0 +1,392 @@ +#ifndef _ASM_X86_PGTABLE_DEFS_H +#define _ASM_X86_PGTABLE_DEFS_H + +#include +#include + +#define FIRST_USER_ADDRESS 0 + +#define _PAGE_BIT_PRESENT 0 /* is present */ +#define _PAGE_BIT_RW 1 /* writeable */ +#define _PAGE_BIT_USER 2 /* userspace addressable */ +#define _PAGE_BIT_PWT 3 /* page write through */ +#define _PAGE_BIT_PCD 4 /* page cache disabled */ +#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */ +#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */ +#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ +#define _PAGE_BIT_PAT 7 /* on 4KB pages */ +#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ +#define _PAGE_BIT_UNUSED1 9 /* available for programmer */ +#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */ +#define _PAGE_BIT_HIDDEN 11 /* hidden by kmemcheck */ +#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ +#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 +#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 +#define _PAGE_BIT_SPLITTING _PAGE_BIT_UNUSED1 /* only valid on a PSE pmd */ +#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ + +/* If _PAGE_BIT_PRESENT is clear, we use these: */ +/* - if the user mapped it with PROT_NONE; pte_present gives true */ +#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL +/* - set: nonlinear file mapping, saved PTE; unset:swap */ +#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY + +#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT) +#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW) +#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER) +#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT) +#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD) +#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) +#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) +#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) +#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) +#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) +#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) +#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) +#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) +#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) +#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) +#define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING) +#define __HAVE_ARCH_PTE_SPECIAL + +#ifdef CONFIG_KMEMCHECK +#define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) +#else +#define _PAGE_HIDDEN (_AT(pteval_t, 0)) +#endif + +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) +#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) +#else +#define _PAGE_NX (_AT(pteval_t, 0)) +#endif + +#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) +#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) + +#ifndef __ASSEMBLY__ +#if defined(CONFIG_X86_64) && 
CONFIG_XEN_COMPAT <= 0x030002 +extern unsigned int __kernel_page_user; +#else +#define __kernel_page_user 0 +#endif +#endif + +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_DIRTY) +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ + _PAGE_DIRTY | __kernel_page_user) + +/* Set of bits not changed in pte_modify */ +#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_CACHE_MASK | _PAGE_IOMAP | \ + _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY) +#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) + +/* + * PAT settings are part of the hypervisor interface, which sets the + * MSR to 0x050100070406 (i.e. WB, WT, UC-, UC, WC, WP [, UC, UC]). + */ +#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT | _PAGE_PAT) +#define _PAGE_CACHE_WB (0) +#define _PAGE_CACHE_WT (_PAGE_PWT) +#define _PAGE_CACHE_WC (_PAGE_PAT) +#define _PAGE_CACHE_WP (_PAGE_PAT | _PAGE_PWT) +#define _PAGE_CACHE_UC_MINUS (_PAGE_PCD) +#define _PAGE_CACHE_UC (_PAGE_PCD | _PAGE_PWT) + +#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) +#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_NX) + +#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \ + _PAGE_USER | _PAGE_ACCESSED) +#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_NX) +#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ + _PAGE_ACCESSED) +#define PAGE_COPY PAGE_COPY_NOEXEC +#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_NX) +#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ + _PAGE_ACCESSED) + +#define __PAGE_KERNEL_EXEC \ + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user) +#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX) + +#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW) +#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW) +#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT) +#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC) +#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT) +#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD) +#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) +#define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER) +#define __PAGE_KERNEL_VVAR_NOCACHE (__PAGE_KERNEL_VVAR | _PAGE_PCD | _PAGE_PWT) +#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) +#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE) +#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) + +#define __PAGE_KERNEL_IO (__PAGE_KERNEL | _PAGE_IOMAP) +#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE | _PAGE_IOMAP) +#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS | _PAGE_IOMAP) +#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC | _PAGE_IOMAP) + +#define PAGE_KERNEL __pgprot(__PAGE_KERNEL) +#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) +#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) +#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) +#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC) +#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) +#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS) +#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE) +#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) +#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE) +#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) +#define 
PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) +#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR) +#define PAGE_KERNEL_VVAR_NOCACHE __pgprot(__PAGE_KERNEL_VVAR_NOCACHE) + +#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) +#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) +#define PAGE_KERNEL_IO_UC_MINUS __pgprot(__PAGE_KERNEL_IO_UC_MINUS) +#define PAGE_KERNEL_IO_WC __pgprot(__PAGE_KERNEL_IO_WC) + +/* xwr */ +#define __P000 PAGE_NONE +#define __P001 PAGE_READONLY +#define __P010 PAGE_COPY +#define __P011 PAGE_COPY +#define __P100 PAGE_READONLY_EXEC +#define __P101 PAGE_READONLY_EXEC +#define __P110 PAGE_COPY_EXEC +#define __P111 PAGE_COPY_EXEC + +#define __S000 PAGE_NONE +#define __S001 PAGE_READONLY +#define __S010 PAGE_SHARED +#define __S011 PAGE_SHARED +#define __S100 PAGE_READONLY_EXEC +#define __S101 PAGE_READONLY_EXEC +#define __S110 PAGE_SHARED_EXEC +#define __S111 PAGE_SHARED_EXEC + +/* + * early identity mapping pte attrib macros. + */ +#ifdef CONFIG_X86_64 +#define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC +#else +/* + * For PDE_IDENT_ATTR include USER bit. As the PDE and PTE protection + * bits are combined, this will alow user to access the high address mapped + * VDSO in the presence of CONFIG_COMPAT_VDSO + */ +#define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */ +#define PDE_IDENT_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */ +#define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */ +#endif + +#ifdef CONFIG_X86_32 +# include +#else +# include "pgtable_64_types.h" +#endif + +#ifndef __ASSEMBLY__ + +#include + +/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */ +#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK) + +/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */ +#define PTE_FLAGS_MASK (~PTE_PFN_MASK) + +typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; + +#include + +typedef struct { pgdval_t pgd; } pgd_t; + +#define __pgd_ma(x) ((pgd_t) { (x) } ) +static inline pgd_t xen_make_pgd(pgdval_t val) +{ + if (likely(val & _PAGE_PRESENT)) + val = pte_phys_to_machine(val); + return (pgd_t) { val }; +} + +#define __pgd_val(x) ((x).pgd) +static inline pgdval_t xen_pgd_val(pgd_t pgd) +{ + pgdval_t ret = __pgd_val(pgd); +#if PAGETABLE_LEVELS == 2 && CONFIG_XEN_COMPAT <= 0x030002 + if (likely(ret)) + ret = machine_to_phys(ret) | _PAGE_PRESENT; +#else + if (likely(ret & _PAGE_PRESENT)) + ret = pte_machine_to_phys(ret); +#endif + return ret; +} + +static inline pgdval_t pgd_flags(pgd_t pgd) +{ + return __pgd_val(pgd) & PTE_FLAGS_MASK; +} + +#if PAGETABLE_LEVELS > 3 +typedef struct { pudval_t pud; } pud_t; + +#define __pud_ma(x) ((pud_t) { (x) } ) +static inline pud_t xen_make_pud(pudval_t val) +{ + if (likely(val & _PAGE_PRESENT)) + val = pte_phys_to_machine(val); + return (pud_t) { val }; +} + +#define __pud_val(x) ((x).pud) +static inline pudval_t xen_pud_val(pud_t pud) +{ + pudval_t ret = __pud_val(pud); + if (likely(ret & _PAGE_PRESENT)) + ret = pte_machine_to_phys(ret); + return ret; +} +#else +#include + +#define __pud_val(x) __pgd_val((x).pgd) +static inline pudval_t xen_pud_val(pud_t pud) +{ + return xen_pgd_val(pud.pgd); +} +#endif + +#if PAGETABLE_LEVELS > 2 +typedef struct { pmdval_t pmd; } pmd_t; + +#define __pmd_ma(x) ((pmd_t) { (x) } ) +static inline pmd_t xen_make_pmd(pmdval_t val) +{ + if (likely(val & _PAGE_PRESENT)) + val = pte_phys_to_machine(val); + return (pmd_t) { val }; +} + +#define __pmd_val(x) ((x).pmd) +static inline pmdval_t xen_pmd_val(pmd_t pmd) +{ + pmdval_t ret = 
__pmd_val(pmd); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (likely(ret)) + ret = pte_machine_to_phys(ret) | _PAGE_PRESENT; +#else + if (likely(ret & _PAGE_PRESENT)) + ret = pte_machine_to_phys(ret); +#endif + return ret; +} +#else +#include + +#define __pmd_ma(x) ((pmd_t) { .pud.pgd = __pgd_ma(x) } ) +#define __pmd_val(x) __pgd_val((x).pud.pgd) +static inline pmdval_t xen_pmd_val(pmd_t pmd) +{ + return xen_pgd_val(pmd.pud.pgd); +} +#endif + +static inline pudval_t pud_flags(pud_t pud) +{ + return __pud_val(pud) & PTE_FLAGS_MASK; +} + +static inline pmdval_t pmd_flags(pmd_t pmd) +{ + return __pmd_val(pmd) & PTE_FLAGS_MASK; +} + +#define __pte_ma(x) ((pte_t) { .pte = (x) } ) +static inline pte_t xen_make_pte(pteval_t val) +{ + if (likely((val & (_PAGE_PRESENT|_PAGE_IOMAP)) == _PAGE_PRESENT)) + val = pte_phys_to_machine(val); + return (pte_t) { .pte = val }; +} + +#define __pte_val(x) ((x).pte) +static inline pteval_t xen_pte_val(pte_t pte) +{ + pteval_t ret = __pte_val(pte); + if (likely((pte.pte_low & (_PAGE_PRESENT|_PAGE_IOMAP)) == _PAGE_PRESENT)) + ret = pte_machine_to_phys(ret); + return ret; +} + +static inline pteval_t pte_flags(pte_t pte) +{ + return __pte_val(pte) & PTE_FLAGS_MASK; +} + +#define pgprot_val(x) ((x).pgprot) +#define __pgprot(x) ((pgprot_t) { (x) } ) + + +typedef struct page *pgtable_t; + +extern pteval_t __supported_pte_mask; +extern void set_nx(void); +extern int nx_enabled; + +#define pgprot_writecombine pgprot_writecombine +extern pgprot_t pgprot_writecombine(pgprot_t prot); + +#ifndef CONFIG_XEN +/* Indicate that x86 has its own track and untrack pfn vma functions */ +#define __HAVE_PFNMAP_TRACKING +#endif + +#define __HAVE_PHYS_MEM_ACCESS_PROT +struct file; +pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t vma_prot); +int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t *vma_prot); + +/* Install a pte for a particular vaddr in kernel space. */ +void set_pte_vaddr(unsigned long vaddr, pte_t pte); + +extern void xen_pagetable_reserve(u64 start, u64 end); + +struct seq_file; +extern void arch_report_meminfo(struct seq_file *m); + +enum { + PG_LEVEL_NONE, + PG_LEVEL_4K, + PG_LEVEL_2M, + PG_LEVEL_1G, + PG_LEVEL_NUM +}; + +#ifdef CONFIG_PROC_FS +extern void update_page_count(int level, unsigned long pages); +#else +static inline void update_page_count(int level, unsigned long pages) { } +#endif + +/* + * Helper function that returns the kernel pagetable entry controlling + * the virtual address 'address'. NULL means no pagetable entry present. + * NOTE: the return type is pte_t but if the pmd is PSE then we return it + * as a pte too. 
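The pte accessors above are where the pseudo-physical to machine translation happens: xen_make_pte() converts present, non-_PAGE_IOMAP values into machine format and xen_pte_val() converts them back, while _PAGE_IOMAP entries pass through untouched. Assuming pte_phys_to_machine() and pte_machine_to_phys() from maddr.h are exact inverses (which is their intent), the round trip below is a no-op; illustrative only, not part of the patch.

static inline int example_pte_roundtrip_ok(pteval_t val)
{
        /* the same pseudo-physical value should come back out */
        return pte_val(__pte(val)) == val;
}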
+ */ +extern pte_t *lookup_address(unsigned long address, unsigned int *level); + +#endif /* !__ASSEMBLY__ */ + +#endif /* _ASM_X86_PGTABLE_DEFS_H */ diff --git a/arch/x86/include/mach-xen/asm/probe_roms.h b/arch/x86/include/mach-xen/asm/probe_roms.h new file mode 100644 index 0000000..da90d01 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/probe_roms.h @@ -0,0 +1,10 @@ +#if !defined(CONFIG_XEN_UNPRIVILEGED_GUEST) +# include_next +#elif !defined(_PROBE_ROMS_H_) +# define _PROBE_ROMS_H_ +struct pci_dev; + +static inline void __iomem *pci_map_biosrom(struct pci_dev *pdev) { return NULL; } +static inline void pci_unmap_biosrom(void __iomem *rom) { } +static inline size_t pci_biosrom_size(struct pci_dev *pdev) { return 0; } +#endif diff --git a/arch/x86/include/mach-xen/asm/processor.h b/arch/x86/include/mach-xen/asm/processor.h new file mode 100644 index 0000000..242e146 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/processor.h @@ -0,0 +1,978 @@ +#ifndef _ASM_X86_PROCESSOR_H +#define _ASM_X86_PROCESSOR_H + +#include + +/* Forward declaration, a strange C thing */ +struct task_struct; +struct mm_struct; + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#define HBP_NUM 4 +/* + * Default implementation of macro that returns current + * instruction pointer ("program counter"). + */ +static inline void *current_text_addr(void) +{ + void *pc; + + asm volatile("mov $1f, %0; 1:":"=r" (pc)); + + return pc; +} + +#ifdef CONFIG_X86_VSMP +# define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT) +# define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT) +#else +# define ARCH_MIN_TASKALIGN 16 +# define ARCH_MIN_MMSTRUCT_ALIGN 0 +#endif + +/* + * CPU type and hardware bug flags. Kept separately for each CPU. + * Members of this structure are referenced in head.S, so think twice + * before touching them. 
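An aside on lookup_address(), declared near the top of this hunk: it returns the kernel pte (or a PSE pmd cast to a pte) together with the mapping level, so callers must check the level before trusting the entry; the arbitrary_virt_to_mfn() macro earlier in this patch does exactly that. A minimal sketch, not part of the patch, with a hypothetical helper name:

static inline int example_vaddr_mapped_4k(unsigned long address)
{
        unsigned int level;
        pte_t *ptep = lookup_address(address, &level);

        return ptep && level == PG_LEVEL_4K && pte_present(*ptep);
}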
[mj] + */ + +struct cpuinfo_x86 { + __u8 x86; /* CPU family */ + __u8 x86_vendor; /* CPU vendor */ + __u8 x86_model; + __u8 x86_mask; +#ifdef CONFIG_X86_32 + char wp_works_ok; /* It doesn't on 386's */ + + /* Problems on some 486Dx4's and old 386's: */ +#ifndef CONFIG_XEN + char hlt_works_ok; +#endif + char hard_math; +#ifndef CONFIG_XEN + char rfu; + char fdiv_bug; + char f00f_bug; + char coma_bug; + char pad0; +#endif +#else + /* Number of 4K pages in DTLB/ITLB combined(in pages): */ + int x86_tlbsize; +#endif + __u8 x86_virt_bits; + __u8 x86_phys_bits; +#ifndef CONFIG_XEN + /* CPUID returned core id bits: */ + __u8 x86_coreid_bits; +#endif + /* Max extended CPUID function supported: */ + __u32 extended_cpuid_level; + /* Maximum supported CPUID level, -1=no CPUID: */ + int cpuid_level; + __u32 x86_capability[NCAPINTS]; + char x86_vendor_id[16]; + char x86_model_id[64]; + /* in KB - valid for CPUS which support this call: */ + int x86_cache_size; + int x86_cache_alignment; /* In bytes */ + int x86_power; + unsigned long loops_per_jiffy; +#ifndef CONFIG_XEN + /* cpuid returned max cores value: */ + u16 x86_max_cores; + u16 apicid; + u16 initial_apicid; +#endif + u16 x86_clflush_size; +#ifndef CONFIG_XEN + /* number of cores as seen by the OS: */ + u16 booted_cores; + /* Physical processor id: */ + u16 phys_proc_id; + /* Core id: */ + u16 cpu_core_id; + /* Compute unit id */ + u8 compute_unit_id; +#endif + /* Index into per_cpu list: */ + u16 cpu_index; +#ifndef CONFIG_XEN + u32 microcode; +#endif +} __attribute__((__aligned__(SMP_CACHE_BYTES))); + +#define X86_VENDOR_INTEL 0 +#define X86_VENDOR_CYRIX 1 +#define X86_VENDOR_AMD 2 +#define X86_VENDOR_UMC 3 +#define X86_VENDOR_CENTAUR 5 +#define X86_VENDOR_TRANSMETA 7 +#define X86_VENDOR_NSC 8 +#define X86_VENDOR_NUM 9 + +#define X86_VENDOR_UNKNOWN 0xff + +/* + * capabilities of CPUs + */ +extern struct cpuinfo_x86 boot_cpu_data; +extern struct cpuinfo_x86 new_cpu_data; + +extern __u32 cpu_caps_cleared[NCAPINTS]; +extern __u32 cpu_caps_set[NCAPINTS]; + +#ifdef CONFIG_SMP +DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); +#define cpu_data(cpu) per_cpu(cpu_info, cpu) +#else +#define cpu_info boot_cpu_data +#define cpu_data(cpu) boot_cpu_data +#endif + +extern const struct seq_operations cpuinfo_op; + +static inline int hlt_works(int cpu) +{ +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN) + return cpu_data(cpu).hlt_works_ok; +#else + return 1; +#endif +} + +#define cache_line_size() (boot_cpu_data.x86_cache_alignment) + +extern void cpu_detect(struct cpuinfo_x86 *c); + +extern struct pt_regs *idle_regs(struct pt_regs *); + +extern void early_cpu_init(void); +extern void identify_boot_cpu(void); +extern void identify_secondary_cpu(struct cpuinfo_x86 *); +extern void print_cpu_info(struct cpuinfo_x86 *); +extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); +extern unsigned short num_cache_leaves; + +extern void detect_extended_topology(struct cpuinfo_x86 *c); +extern void detect_ht(struct cpuinfo_x86 *c); + +static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + /* ecx is often an input as well as an output. 
*/ + asm volatile(XEN_CPUID + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx) + : "memory"); +} + +static inline void load_cr3(pgd_t *pgdir) +{ + write_cr3(__pa(pgdir)); +} + +#ifndef CONFIG_X86_NO_TSS +#ifdef CONFIG_X86_32 +/* This is the TSS defined by the hardware. */ +struct x86_hw_tss { + unsigned short back_link, __blh; + unsigned long sp0; + unsigned short ss0, __ss0h; + unsigned long sp1; + /* ss1 caches MSR_IA32_SYSENTER_CS: */ + unsigned short ss1, __ss1h; + unsigned long sp2; + unsigned short ss2, __ss2h; + unsigned long __cr3; + unsigned long ip; + unsigned long flags; + unsigned long ax; + unsigned long cx; + unsigned long dx; + unsigned long bx; + unsigned long sp; + unsigned long bp; + unsigned long si; + unsigned long di; + unsigned short es, __esh; + unsigned short cs, __csh; + unsigned short ss, __ssh; + unsigned short ds, __dsh; + unsigned short fs, __fsh; + unsigned short gs, __gsh; + unsigned short ldt, __ldth; + unsigned short trace; + unsigned short io_bitmap_base; + +} __attribute__((packed)); +extern struct tss_struct doublefault_tss; +#else +struct x86_hw_tss { + u32 reserved1; + u64 sp0; + u64 sp1; + u64 sp2; + u64 reserved2; + u64 ist[7]; + u32 reserved3; + u32 reserved4; + u16 reserved5; + u16 io_bitmap_base; + +} __attribute__((packed)) ____cacheline_aligned; +#endif +#endif /* CONFIG_X86_NO_TSS */ + +/* + * IO-bitmap sizes: + */ +#define IO_BITMAP_BITS 65536 +#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8) +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long)) +#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap) +#define INVALID_IO_BITMAP_OFFSET 0x8000 + +#ifndef CONFIG_X86_NO_TSS +struct tss_struct { + /* + * The hardware state: + */ + struct x86_hw_tss x86_tss; + + /* + * The extra 1 is there because the CPU will access an + * additional byte beyond the end of the IO permission + * bitmap. The extra byte must be all 1 bits, and must + * be within the limit. + */ + unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; + + /* + * .. 
and then another 0x100 bytes for the emergency kernel stack: + */ + unsigned long stack[64]; + +} ____cacheline_aligned; + +DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss); + +/* + * Save the original ist values for checking stack pointers during debugging + */ +struct orig_ist { + unsigned long ist[7]; +}; +#endif /* CONFIG_X86_NO_TSS */ + +#define MXCSR_DEFAULT 0x1f80 + +struct i387_fsave_struct { + u32 cwd; /* FPU Control Word */ + u32 swd; /* FPU Status Word */ + u32 twd; /* FPU Tag Word */ + u32 fip; /* FPU IP Offset */ + u32 fcs; /* FPU IP Selector */ + u32 foo; /* FPU Operand Pointer Offset */ + u32 fos; /* FPU Operand Pointer Selector */ + + /* 8*10 bytes for each FP-reg = 80 bytes: */ + u32 st_space[20]; + + /* Software status information [not touched by FSAVE ]: */ + u32 status; +}; + +struct i387_fxsave_struct { + u16 cwd; /* Control Word */ + u16 swd; /* Status Word */ + u16 twd; /* Tag Word */ + u16 fop; /* Last Instruction Opcode */ + union { + struct { + u64 rip; /* Instruction Pointer */ + u64 rdp; /* Data Pointer */ + }; + struct { + u32 fip; /* FPU IP Offset */ + u32 fcs; /* FPU IP Selector */ + u32 foo; /* FPU Operand Offset */ + u32 fos; /* FPU Operand Selector */ + }; + }; + u32 mxcsr; /* MXCSR Register State */ + u32 mxcsr_mask; /* MXCSR Mask */ + + /* 8*16 bytes for each FP-reg = 128 bytes: */ + u32 st_space[32]; + + /* 16*16 bytes for each XMM-reg = 256 bytes: */ + u32 xmm_space[64]; + + u32 padding[12]; + + union { + u32 padding1[12]; + u32 sw_reserved[12]; + }; + +} __attribute__((aligned(16))); + +struct i387_soft_struct { + u32 cwd; + u32 swd; + u32 twd; + u32 fip; + u32 fcs; + u32 foo; + u32 fos; + /* 8*10 bytes for each FP-reg = 80 bytes: */ + u32 st_space[20]; + u8 ftop; + u8 changed; + u8 lookahead; + u8 no_update; + u8 rm; + u8 alimit; + struct math_emu_info *info; + u32 entry_eip; +}; + +struct ymmh_struct { + /* 16 * 16 bytes for each YMMH-reg = 256 bytes */ + u32 ymmh_space[64]; +}; + +struct xsave_hdr_struct { + u64 xstate_bv; + u64 reserved1[2]; + u64 reserved2[5]; +} __attribute__((packed)); + +struct xsave_struct { + struct i387_fxsave_struct i387; + struct xsave_hdr_struct xsave_hdr; + struct ymmh_struct ymmh; + /* new processor state extensions will go here */ +} __attribute__ ((packed, aligned (64))); + +union thread_xstate { + struct i387_fsave_struct fsave; + struct i387_fxsave_struct fxsave; + struct i387_soft_struct soft; + struct xsave_struct xsave; +}; + +struct fpu { + unsigned int last_cpu; + unsigned int has_fpu; + union thread_xstate *state; +}; + +#ifdef CONFIG_X86_64 +#ifndef CONFIG_X86_NO_TSS +DECLARE_PER_CPU(struct orig_ist, orig_ist); +#endif + +union irq_stack_union { + char irq_stack[IRQ_STACK_SIZE]; + /* + * GCC hardcodes the stack canary as %gs:40. Since the + * irq_stack is the object at %gs:0, we reserve the bottom + * 48 bytes of the irq stack for the canary. + */ + struct { + char gs_base[40]; + unsigned long stack_canary; + }; +}; + +DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union); +DECLARE_INIT_PER_CPU(irq_stack_union); + +DECLARE_PER_CPU(char *, irq_stack_ptr); +DECLARE_PER_CPU(unsigned int, irq_count); +extern unsigned long kernel_eflags; +extern asmlinkage void ignore_sysret(void); +#else /* X86_64 */ +#ifdef CONFIG_CC_STACKPROTECTOR +/* + * Make sure stack canary segment base is cached-aligned: + * "For Intel Atom processors, avoid non zero segment base address + * that is not aligned to cache line boundary at all cost." + * (Optim Ref Manual Assembly/Compiler Coding Rule 15.) 
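On the 64-bit side above, irq_stack_union reserves its first 40 bytes (gs_base) because gcc hardcodes the stack-protector canary at %gs:40. Something like the following compile-time check keeps that layout honest; this is a sketch, not necessarily how this tree spells it, and the function name is hypothetical.

static inline void example_check_canary_layout(void)
{
        /* the canary must sit right after the 40-byte gs_base pad */
        BUILD_BUG_ON(offsetof(union irq_stack_union, stack_canary) != 40);
}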
+ */ +struct stack_canary { + char __pad[20]; /* canary at %gs:20 */ + unsigned long canary; +}; +DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); +#endif +#endif /* X86_64 */ + +extern unsigned int xstate_size; +extern void free_thread_xstate(struct task_struct *); +extern struct kmem_cache *task_xstate_cachep; + +struct perf_event; + +struct thread_struct { + /* Cached TLS descriptors: */ + struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; + unsigned long sp0; + unsigned long sp; +#ifdef CONFIG_X86_32 + unsigned long sysenter_cs; +#else + unsigned short es; + unsigned short ds; + unsigned short fsindex; + unsigned short gsindex; +#endif +#ifdef CONFIG_X86_32 + unsigned long ip; +#endif +#ifdef CONFIG_X86_64 + unsigned long fs; +#endif + unsigned long gs; + /* Save middle states of ptrace breakpoints */ + struct perf_event *ptrace_bps[HBP_NUM]; + /* Debug status used for traps, single steps, etc... */ + unsigned long debugreg6; + /* Keep track of the exact dr7 value set by the user */ + unsigned long ptrace_dr7; + /* Fault info: */ + unsigned long cr2; + unsigned long trap_no; + unsigned long error_code; + /* floating point and extended processor state */ + struct fpu fpu; +#ifdef CONFIG_X86_32 + /* Virtual 86 mode info */ + struct vm86_struct __user *vm86_info; + unsigned long screen_bitmap; + unsigned long v86flags, v86mask, saved_sp0; + unsigned int saved_fs, saved_gs; +#endif + /* IO permissions: */ + unsigned long *io_bitmap_ptr; + unsigned long iopl; + /* Max allowed port in the bitmap, in bytes: */ + unsigned io_bitmap_max; +}; + +static inline unsigned long xen_get_debugreg(int regno) +{ + return HYPERVISOR_get_debugreg(regno); +} + +static inline void xen_set_debugreg(int regno, unsigned long value) +{ + WARN_ON(HYPERVISOR_set_debugreg(regno, value)); +} + +/* + * Set IOPL bits in EFLAGS from given mask + */ +static inline void xen_set_iopl_mask(unsigned mask) +{ + struct physdev_set_iopl set_iopl; + + /* Force the change at ring 0. */ + set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3; + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl)); +} + +#ifndef CONFIG_X86_NO_TSS +static inline void +native_load_sp0(struct tss_struct *tss, struct thread_struct *thread) +{ + tss->x86_tss.sp0 = thread->sp0; +#ifdef CONFIG_X86_32 + /* Only happens when SEP is enabled, no need to test "SEP"arately: */ + if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { + tss->x86_tss.ss1 = thread->sysenter_cs; + wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); + } +#endif +} +#else +#define xen_load_sp0(tss, thread) do { \ + if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->sp0)) \ + BUG(); \ +} while (0) +#endif + +#define __cpuid xen_cpuid +#define paravirt_enabled() 1 + +/* + * These special macros can be used to get or set a debugging register + */ +#define get_debugreg(var, register) \ + (var) = xen_get_debugreg(register) +#define set_debugreg(value, register) \ + xen_set_debugreg(register, value) + +#define load_sp0 xen_load_sp0 + +#define set_iopl_mask xen_set_iopl_mask + +/* + * Save the cr4 feature set we're using (ie + * Pentium 4MB enable and PPro Global page + * enable), so that any CPU's that boot up + * after us can get the correct flags. 
+ */ +extern unsigned long mmu_cr4_features; + +static inline void set_in_cr4(unsigned long mask) +{ + unsigned long cr4; + + mmu_cr4_features |= mask; + cr4 = read_cr4(); + cr4 |= mask; + write_cr4(cr4); +} + +static inline void clear_in_cr4(unsigned long mask) +{ + unsigned long cr4; + + mmu_cr4_features &= ~mask; + cr4 = read_cr4(); + cr4 &= ~mask; + write_cr4(cr4); +} + +typedef struct { + unsigned long seg; +} mm_segment_t; + + +/* + * create a kernel thread without removing it from tasklists + */ +extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); + +/* Free all resources held by a thread. */ +extern void release_thread(struct task_struct *); + +/* Prepare to copy thread state - unlazy all lazy state */ +extern void prepare_to_copy(struct task_struct *tsk); + +unsigned long get_wchan(struct task_struct *p); + +/* + * Generic CPUID function + * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx + * resulting in stale register contents being returned. + */ +static inline void cpuid(unsigned int op, + unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + *eax = op; + *ecx = 0; + __cpuid(eax, ebx, ecx, edx); +} + +/* Some CPUID calls want 'count' to be placed in ecx */ +static inline void cpuid_count(unsigned int op, int count, + unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + *eax = op; + *ecx = count; + __cpuid(eax, ebx, ecx, edx); +} + +/* + * CPUID functions returning a single datum + */ +static inline unsigned int cpuid_eax(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return eax; +} + +static inline unsigned int cpuid_ebx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return ebx; +} + +static inline unsigned int cpuid_ecx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return ecx; +} + +static inline unsigned int cpuid_edx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return edx; +} + +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ +static inline void rep_nop(void) +{ + asm volatile("rep; nop" ::: "memory"); +} + +static inline void cpu_relax(void) +{ + rep_nop(); +} + +/* Stop speculative execution and prefetching of modified code. */ +static inline void sync_core(void) +{ + int tmp; + +#if defined(CONFIG_M386) || defined(CONFIG_M486) + if (boot_cpu_data.x86 < 5) + /* There is no speculative execution. + * jmp is a barrier to prefetching. */ + asm volatile("jmp 1f\n1:\n" ::: "memory"); + else +#endif + /* cpuid is a barrier to speculative execution. + * Prefetched instructions are automatically + * invalidated when modified. 
*/ + asm volatile("cpuid" : "=a" (tmp) : "0" (1) + : "ebx", "ecx", "edx", "memory"); +} + +static inline void __monitor(const void *eax, unsigned long ecx, + unsigned long edx) +{ + /* "monitor %eax, %ecx, %edx;" */ + asm volatile(".byte 0x0f, 0x01, 0xc8;" + :: "a" (eax), "c" (ecx), "d"(edx)); +} + +static inline void __mwait(unsigned long eax, unsigned long ecx) +{ + /* "mwait %eax, %ecx;" */ + asm volatile(".byte 0x0f, 0x01, 0xc9;" + :: "a" (eax), "c" (ecx)); +} + +static inline void __sti_mwait(unsigned long eax, unsigned long ecx) +{ + trace_hardirqs_on(); + /* "mwait %eax, %ecx;" */ + asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" + :: "a" (eax), "c" (ecx)); +} + +extern void select_idle_routine(const struct cpuinfo_x86 *c); +extern void init_amd_e400_c1e_mask(void); + +extern unsigned long boot_option_idle_override; +extern bool amd_e400_c1e_detected; + +enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT, + IDLE_POLL, IDLE_FORCE_MWAIT}; + +extern void enable_sep_cpu(void); +extern int sysenter_setup(void); + +extern void early_trap_init(void); + +/* Defined in head.S */ +extern struct desc_ptr early_gdt_descr; + +extern void cpu_set_gdt(int); +extern void switch_to_new_gdt(int); +extern void load_percpu_segment(int); +extern void cpu_init(void); + +static inline unsigned long get_debugctlmsr(void) +{ + unsigned long debugctlmsr = 0; + +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return 0; +#endif + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); + + return debugctlmsr; +} + +static inline void update_debugctlmsr(unsigned long debugctlmsr) +{ +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return; +#endif + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); +} + +/* + * from system description table in BIOS. Mostly for MCA use, but + * others may find it useful: + */ +extern unsigned int machine_id; +extern unsigned int machine_submodel_id; +extern unsigned int BIOS_revision; + +/* Boot loader type from the setup header: */ +extern int bootloader_type; +extern int bootloader_version; + +extern char ignore_fpu_irq; + +#define HAVE_ARCH_PICK_MMAP_LAYOUT 1 +#define ARCH_HAS_PREFETCHW +#define ARCH_HAS_SPINLOCK_PREFETCH + +#ifdef CONFIG_X86_32 +# define BASE_PREFETCH ASM_NOP4 +# define ARCH_HAS_PREFETCH +#else +# define BASE_PREFETCH "prefetcht0 (%1)" +#endif + +/* + * Prefetch instructions for Pentium III (+) and AMD Athlon (+) + * + * It's not worth to care about 3dnow prefetches for the K6 + * because they are microcoded there and very slow. + */ +static inline void prefetch(const void *x) +{ + alternative_input(BASE_PREFETCH, + "prefetchnta (%1)", + X86_FEATURE_XMM, + "r" (x)); +} + +/* + * 3dnow prefetch to get an exclusive cache line. + * Useful for spinlocks to avoid one state transition in the + * cache coherency protocol: + */ +static inline void prefetchw(const void *x) +{ + alternative_input(BASE_PREFETCH, + "prefetchw (%1)", + X86_FEATURE_3DNOW, + "r" (x)); +} + +static inline void spin_lock_prefetch(const void *x) +{ + prefetchw(x); +} + +#ifdef CONFIG_X86_32 +/* + * User space process size: 3GB (default). + */ +#define TASK_SIZE PAGE_OFFSET +#define TASK_SIZE_MAX TASK_SIZE +#define STACK_TOP TASK_SIZE +#define STACK_TOP_MAX STACK_TOP + +#define INIT_THREAD { \ + .sp0 = sizeof(init_stack) + (long)&init_stack, \ + .vm86_info = NULL, \ + .sysenter_cs = __KERNEL_CS, \ + .io_bitmap_ptr = NULL, \ +} + +/* + * Note that the .io_bitmap member must be extra-big. 
This is because + * the CPU will access an additional byte beyond the end of the IO + * permission bitmap. The extra byte must be all 1 bits, and must + * be within the limit. + */ +#define INIT_TSS { \ + .x86_tss = { \ + .sp0 = sizeof(init_stack) + (long)&init_stack, \ + .ss0 = __KERNEL_DS, \ + .ss1 = __KERNEL_CS, \ + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \ + }, \ + .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \ +} + +extern unsigned long thread_saved_pc(struct task_struct *tsk); + +#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long)) +#define KSTK_TOP(info) \ +({ \ + unsigned long *__ptr = (unsigned long *)(info); \ + (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \ +}) + +/* + * The below -8 is to reserve 8 bytes on top of the ring0 stack. + * This is necessary to guarantee that the entire "struct pt_regs" + * is accessible even if the CPU haven't stored the SS/ESP registers + * on the stack (interrupt gate does not save these registers + * when switching to the same priv ring). + * Therefore beware: accessing the ss/esp fields of the + * "struct pt_regs" is possible, but they may contain the + * completely wrong values. + */ +#define task_pt_regs(task) \ +({ \ + struct pt_regs *__regs__; \ + __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \ + __regs__ - 1; \ +}) + +#else +/* + * User space process size. 47bits minus one guard page. + */ +#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE) + +/* This decides where the kernel will search for a free chunk of vm + * space during mmap's. + */ +#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \ + 0xc0000000 : 0xFFFFe000) + +#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \ + IA32_PAGE_OFFSET : TASK_SIZE_MAX) +#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \ + IA32_PAGE_OFFSET : TASK_SIZE_MAX) + +#define STACK_TOP TASK_SIZE +#define STACK_TOP_MAX TASK_SIZE_MAX + +#define INIT_THREAD { \ + .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ +} + +#define INIT_TSS { \ + .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ +} + +/* + * Return saved PC of a blocked thread. + * What is this good for? it will be always the scheduler or ret_from_fork. + */ +#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8)) + +#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) +#endif /* CONFIG_X86_64 */ + +extern void start_thread(struct pt_regs *regs, unsigned long new_ip, + unsigned long new_sp); + +/* + * This decides where the kernel will search for a free chunk of vm + * space during mmap's. 
+ */ +#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) + +#define KSTK_EIP(task) (task_pt_regs(task)->ip) +#define KSTK_ESP(task) (task_pt_regs(task)->sp) + +/* Get/set a process' ability to use the timestamp counter instruction */ +#define GET_TSC_CTL(adr) get_tsc_mode((adr)) +#define SET_TSC_CTL(val) set_tsc_mode((val)) + +extern int get_tsc_mode(unsigned long adr); +extern int set_tsc_mode(unsigned int val); + +extern int amd_get_nb_id(int cpu); + +struct aperfmperf { + u64 aperf, mperf; +}; + +static inline void get_aperfmperf(struct aperfmperf *am) +{ + WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_APERFMPERF)); + + rdmsrl(MSR_IA32_APERF, am->aperf); + rdmsrl(MSR_IA32_MPERF, am->mperf); +} + +#define APERFMPERF_SHIFT 10 + +static inline +unsigned long calc_aperfmperf_ratio(struct aperfmperf *old, + struct aperfmperf *new) +{ + u64 aperf = new->aperf - old->aperf; + u64 mperf = new->mperf - old->mperf; + unsigned long ratio = aperf; + + mperf >>= APERFMPERF_SHIFT; + if (mperf) + ratio = div64_u64(aperf, mperf); + + return ratio; +} + +/* + * AMD errata checking + */ +#ifdef CONFIG_CPU_SUP_AMD +extern const int amd_erratum_383[]; +extern const int amd_erratum_400[]; +extern bool cpu_has_amd_erratum(const int *); + +#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 } +#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 } +#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \ + ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end)) +#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff) +#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff) +#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff) + +#else +#define cpu_has_amd_erratum(x) (false) +#endif /* CONFIG_CPU_SUP_AMD */ + +#endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/include/mach-xen/asm/setup.h b/arch/x86/include/mach-xen/asm/setup.h new file mode 100644 index 0000000..aaa418c --- /dev/null +++ b/arch/x86/include/mach-xen/asm/setup.h @@ -0,0 +1,21 @@ +#ifndef __ASSEMBLY__ + +void xen_start_kernel(void); +void xen_arch_setup(void); + +#ifdef CONFIG_X86_64 +void reserve_pfn_range(unsigned long pfn, unsigned long nr); +void reserve_pgtable_low(void); +#endif + +extern unsigned long xen_initrd_start; + +#ifdef CONFIG_EFI +void efi_probe(void); +#else +#define efi_probe() ((void)0) +#endif + +#endif + +#include_next diff --git a/arch/x86/include/mach-xen/asm/smp-processor-id.h b/arch/x86/include/mach-xen/asm/smp-processor-id.h new file mode 100644 index 0000000..c6c1ec5 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/smp-processor-id.h @@ -0,0 +1,36 @@ +#ifndef _ASM_X86_SMP_PROCESSOR_ID_H +#define _ASM_X86_SMP_PROCESSOR_ID_H + +#if defined(CONFIG_SMP) && !defined(__ASSEMBLY__) + +#include + +DECLARE_PER_CPU(int, cpu_number); + +/* + * This function is needed by all SMP systems. It must _always_ be valid + * from the initial startup. We map APIC_BASE very early in page_setup(), + * so this is correct in the x86 case. 
+ */ +#define raw_smp_processor_id() percpu_read(cpu_number) +#define safe_smp_processor_id() smp_processor_id() + +#ifdef CONFIG_X86_64_SMP +#define stack_smp_processor_id() \ +({ \ + struct thread_info *ti; \ + __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \ + ti->cpu; \ +}) +#endif + +#ifdef CONFIG_DEBUG_PREEMPT +extern unsigned int debug_smp_processor_id(void); +# define smp_processor_id() debug_smp_processor_id() +#else +# define smp_processor_id() raw_smp_processor_id() +#endif + +#endif /* SMP && !__ASSEMBLY__ */ + +#endif /* _ASM_X86_SMP_PROCESSOR_ID_H */ diff --git a/arch/x86/include/mach-xen/asm/smp.h b/arch/x86/include/mach-xen/asm/smp.h new file mode 100644 index 0000000..76f78bf --- /dev/null +++ b/arch/x86/include/mach-xen/asm/smp.h @@ -0,0 +1,241 @@ +#ifndef _ASM_X86_SMP_H +#define _ASM_X86_SMP_H +#ifndef __ASSEMBLY__ +#include +#include +#include + +/* + * We need the APIC definitions automatically as part of 'smp.h' + */ +#ifdef CONFIG_X86_LOCAL_APIC +# include +# include +# ifdef CONFIG_X86_IO_APIC +# include +# endif +#endif +#include +#include +#include + +extern unsigned int num_processors; + +#ifndef CONFIG_XEN +static inline bool cpu_has_ht_siblings(void) +{ + bool has_siblings = false; +#ifdef CONFIG_SMP + has_siblings = cpu_has_ht && smp_num_siblings > 1; +#endif + return has_siblings; +} + +DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map); +DECLARE_PER_CPU(cpumask_var_t, cpu_core_map); +/* cpus sharing the last level cache: */ +DECLARE_PER_CPU(cpumask_var_t, cpu_llc_shared_map); +DECLARE_PER_CPU(u16, cpu_llc_id); +DECLARE_PER_CPU(int, cpu_number); +#endif + +static inline const struct cpumask *cpu_sibling_mask(int cpu) +{ + return cpumask_of(cpu); +} + +static inline const struct cpumask *cpu_core_mask(int cpu) +{ + return cpumask_of(cpu); +} + +#ifndef CONFIG_XEN +static inline struct cpumask *cpu_llc_shared_mask(int cpu) +{ + return per_cpu(cpu_llc_shared_map, cpu); +} + +DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid); +DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) +DECLARE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid); +#endif +#endif + +#ifdef CONFIG_SMP + +#ifndef CONFIG_XEN + +/* Static state in head.S used to set up a CPU */ +extern unsigned long stack_start; /* Initial stack pointer address */ + +struct smp_ops { + void (*smp_prepare_boot_cpu)(void); + void (*smp_prepare_cpus)(unsigned max_cpus); + void (*smp_cpus_done)(unsigned max_cpus); + + void (*stop_other_cpus)(int wait); + void (*smp_send_reschedule)(int cpu); + + int (*cpu_up)(unsigned cpu); + int (*cpu_disable)(void); + void (*cpu_die)(unsigned int cpu); + void (*play_dead)(void); + + void (*send_call_func_ipi)(const struct cpumask *mask); + void (*send_call_func_single_ipi)(int cpu); +}; + +/* Globals due to paravirt */ +extern void set_cpu_sibling_map(int cpu); + +extern struct smp_ops smp_ops; + +static inline void smp_send_stop(void) +{ + smp_ops.stop_other_cpus(0); +} + +static inline void stop_other_cpus(void) +{ + smp_ops.stop_other_cpus(1); +} + +static inline void smp_prepare_boot_cpu(void) +{ + smp_ops.smp_prepare_boot_cpu(); +} + +static inline void smp_prepare_cpus(unsigned int max_cpus) +{ + smp_ops.smp_prepare_cpus(max_cpus); +} + +static inline void smp_cpus_done(unsigned int max_cpus) +{ + smp_ops.smp_cpus_done(max_cpus); +} + +static inline int __cpu_up(unsigned int cpu) +{ + return smp_ops.cpu_up(cpu); +} + +static inline int __cpu_disable(void) +{ + return smp_ops.cpu_disable(); +} + +static inline void 
__cpu_die(unsigned int cpu) +{ + smp_ops.cpu_die(cpu); +} + +static inline void play_dead(void) +{ + smp_ops.play_dead(); +} + +static inline void smp_send_reschedule(int cpu) +{ + smp_ops.smp_send_reschedule(cpu); +} + +static inline void arch_send_call_function_single_ipi(int cpu) +{ + smp_ops.send_call_func_single_ipi(cpu); +} + +static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask) +{ + smp_ops.send_call_func_ipi(mask); +} + +void cpu_disable_common(void); +void native_smp_prepare_boot_cpu(void); +void native_smp_prepare_cpus(unsigned int max_cpus); +void native_smp_cpus_done(unsigned int max_cpus); +int native_cpu_up(unsigned int cpunum); +int native_cpu_disable(void); +void native_cpu_die(unsigned int cpu); +void native_play_dead(void); +void play_dead_common(void); +void wbinvd_on_cpu(int cpu); +int wbinvd_on_all_cpus(void); + +void smp_store_cpu_info(int id); +#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) + +#else /* CONFIG_XEN */ + +extern int __cpu_disable(void); +extern void __cpu_die(unsigned int cpu); +void xen_stop_other_cpus(int wait); +void xen_smp_send_reschedule(int cpu); +void xen_send_call_func_ipi(const struct cpumask *mask); +void xen_send_call_func_single_ipi(int cpu); + +static inline void smp_send_stop(void) +{ + xen_stop_other_cpus(0); +} + +#define smp_send_reschedule xen_smp_send_reschedule +#define arch_send_call_function_single_ipi xen_send_call_func_single_ipi +#define arch_send_call_function_ipi_mask xen_send_call_func_ipi + +void play_dead(void); + +#endif /* CONFIG_XEN */ + +/* We don't mark CPUs online until __cpu_up(), so we need another measure */ +static inline int num_booting_cpus(void) +{ + return cpumask_weight(cpu_callout_mask); +} +#elif /* !CONFIG_SMP && */ !defined(CONFIG_XEN) +#define wbinvd_on_cpu(cpu) wbinvd() +static inline int wbinvd_on_all_cpus(void) +{ + wbinvd(); + return 0; +} +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_XEN +int wbinvd_on_all_cpus(void); +#endif + +extern unsigned disabled_cpus __cpuinitdata; + +#include + +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN) + +#ifndef CONFIG_X86_64 +static inline int logical_smp_processor_id(void) +{ + /* we don't want to mark this access volatile - bad code generation */ + return GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); +} + +#endif + +extern int hard_smp_processor_id(void); + +#else /* CONFIG_X86_LOCAL_APIC */ + +# ifndef CONFIG_SMP +# define hard_smp_processor_id() 0 +# endif + +#endif /* CONFIG_X86_LOCAL_APIC */ + +#ifdef CONFIG_DEBUG_NMI_SELFTEST +extern void nmi_selftest(void); +#else +#define nmi_selftest() do { } while (0) +#endif + +#endif /* __ASSEMBLY__ */ +#endif /* _ASM_X86_SMP_H */ diff --git a/arch/x86/include/mach-xen/asm/spinlock.h b/arch/x86/include/mach-xen/asm/spinlock.h new file mode 100644 index 0000000..af304bb --- /dev/null +++ b/arch/x86/include/mach-xen/asm/spinlock.h @@ -0,0 +1,379 @@ +#ifndef _ASM_X86_SPINLOCK_H +#define _ASM_X86_SPINLOCK_H + +#include +#include +#include +#include + +/* + * Your basic SMP spinlocks, allowing only a single CPU anywhere + * + * Simple spin lock operations. There are two variants, one clears IRQ's + * on the local processor, one does not. + * + * These are fair FIFO ticket locks, which are currently limited to 256 + * CPUs. 
+ * + * (the type definitions are in asm/spinlock_types.h) + */ + +#ifdef CONFIG_X86_32 +# define LOCK_PTR_REG "a" +# define REG_PTR_MODE "k" +#else +# define LOCK_PTR_REG "D" +# define REG_PTR_MODE "q" +#endif + +#if defined(CONFIG_XEN) || (defined(CONFIG_X86_32) && \ + (defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE))) +/* + * On Xen, as we read back the result of the unlocking increment, we must use + * a locked access (or insert a full memory barrier) in all cases (so that we + * read what is globally visible). + * + * On PPro SMP or if we are using OOSTORE, we use a locked operation to unlock + * (PPro errata 66, 92) + */ +# define UNLOCK_LOCK_PREFIX LOCK_PREFIX +#else +# define UNLOCK_LOCK_PREFIX +#endif + +#ifdef TICKET_SHIFT + +#include +#include + +int xen_spinlock_init(unsigned int cpu); +void xen_spinlock_cleanup(unsigned int cpu); +#if CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING +struct __raw_tickets xen_spin_adjust(const arch_spinlock_t *, + struct __raw_tickets); +#else +#define xen_spin_adjust(lock, raw_tickets) (raw_tickets) +#define xen_spin_wait(l, t, f) xen_spin_wait(l, t) +#endif +unsigned int xen_spin_wait(arch_spinlock_t *, struct __raw_tickets *, + unsigned int flags); +void xen_spin_kick(const arch_spinlock_t *, unsigned int ticket); + +/* + * Ticket locks are conceptually two parts, one indicating the current head of + * the queue, and the other indicating the current tail. The lock is acquired + * by atomically noting the tail and incrementing it by one (thus adding + * ourself to the queue and noting our position), then waiting until the head + * becomes equal to the the initial value of the tail. + * + * We use an xadd covering *both* parts of the lock, to increment the tail and + * also load the position of the head, which takes care of memory ordering + * issues and should be optimal for the uncontended case. Note the tail must be + * in the high part, because a wide xadd increment of the low part would carry + * up and contaminate the high part. + */ +#define __spin_count_dec(c, l) (vcpu_running((l)->owner) ? 
--(c) : ((c) >>= 1)) + +#if CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING +static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock) +{ + struct __raw_tickets inc = { .tail = 1 }; + unsigned int count, flags = arch_local_irq_save(); + + inc = xadd(&lock->tickets, inc); + if (likely(inc.head == inc.tail)) + arch_local_irq_restore(flags); + else { + inc = xen_spin_adjust(lock, inc); + arch_local_irq_restore(flags); + count = 1 << 12; + do { + while (inc.head != inc.tail + && __spin_count_dec(count, lock)) { + cpu_relax(); + inc.head = ACCESS_ONCE(lock->tickets.head); + } + } while (unlikely(!count) + && (count = xen_spin_wait(lock, &inc, flags))); + } + barrier(); /* make sure nothing creeps before the lock is taken */ + lock->owner = raw_smp_processor_id(); +} +#else +#define __ticket_spin_lock(lock) __ticket_spin_lock_flags(lock, -1) +#endif + +static __always_inline void __ticket_spin_lock_flags(arch_spinlock_t *lock, + unsigned long flags) +{ + struct __raw_tickets inc = { .tail = 1 }; + + inc = xadd(&lock->tickets, inc); + if (unlikely(inc.head != inc.tail)) { + unsigned int count = 1 << 12; + + inc = xen_spin_adjust(lock, inc); + do { + while (inc.head != inc.tail + && __spin_count_dec(count, lock)) { + cpu_relax(); + inc.head = ACCESS_ONCE(lock->tickets.head); + } + } while (unlikely(!count) + && (count = xen_spin_wait(lock, &inc, flags))); + } + barrier(); /* make sure nothing creeps before the lock is taken */ + lock->owner = raw_smp_processor_id(); +} + +#undef __spin_count_dec + +static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock) +{ + arch_spinlock_t old; + + old.tickets = ACCESS_ONCE(lock->tickets); + if (old.tickets.head != old.tickets.tail) + return 0; + + /* cmpxchg is a full barrier, so nothing can move before it */ + if (cmpxchg(&lock->head_tail, old.head_tail, + old.head_tail + (1 << TICKET_SHIFT)) != old.head_tail) + return 0; + lock->owner = raw_smp_processor_id(); + return 1; +} + +static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) +{ + register struct __raw_tickets new; + + __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX); +#if !defined(XEN_SPINLOCK_SOURCE) || !CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING +# undef UNLOCK_LOCK_PREFIX +#endif + new = ACCESS_ONCE(lock->tickets); + if (new.head != new.tail) + xen_spin_kick(lock, new.head); +} + +static inline int __ticket_spin_is_locked(arch_spinlock_t *lock) +{ + struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); + + return tmp.tail != tmp.head; +} + +static inline int __ticket_spin_is_contended(arch_spinlock_t *lock) +{ + struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); + + return (__ticket_t)(tmp.tail - tmp.head) > 1; +} + +#define __arch_spin(n) __ticket_spin_##n + +#else /* TICKET_SHIFT */ + +static inline int xen_spinlock_init(unsigned int cpu) { return 0; } +static inline void xen_spinlock_cleanup(unsigned int cpu) {} + +static inline int __byte_spin_is_locked(arch_spinlock_t *lock) +{ + return lock->lock != 0; +} + +static inline int __byte_spin_is_contended(arch_spinlock_t *lock) +{ + return lock->spinners != 0; +} + +static inline void __byte_spin_lock(arch_spinlock_t *lock) +{ + s8 val = 1; + + asm("1: xchgb %1, %0\n" + " test %1,%1\n" + " jz 3f\n" + " " LOCK_PREFIX "incb %2\n" + "2: rep;nop\n" + " cmpb $1, %0\n" + " je 2b\n" + " " LOCK_PREFIX "decb %2\n" + " jmp 1b\n" + "3:" + : "+m" (lock->lock), "+q" (val), "+m" (lock->spinners): : "memory"); +} + +#define __byte_spin_lock_flags(lock, flags) __byte_spin_lock(lock) + +static inline int 
__byte_spin_trylock(arch_spinlock_t *lock) +{ + u8 old = 1; + + asm("xchgb %1,%0" + : "+m" (lock->lock), "+q" (old) : : "memory"); + + return old == 0; +} + +static inline void __byte_spin_unlock(arch_spinlock_t *lock) +{ + smp_wmb(); + lock->lock = 0; +} + +#define __arch_spin(n) __byte_spin_##n + +#endif /* TICKET_SHIFT */ + +#if defined(CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING) \ + && CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING +void xen_spin_irq_enter(void); +void xen_spin_irq_exit(void); +#else +static inline void xen_spin_irq_enter(void) {} +static inline void xen_spin_irq_exit(void) {} +#endif + +static inline int arch_spin_is_locked(arch_spinlock_t *lock) +{ + return __arch_spin(is_locked)(lock); +} + +static inline int arch_spin_is_contended(arch_spinlock_t *lock) +{ + return __arch_spin(is_contended)(lock); +} +#define arch_spin_is_contended arch_spin_is_contended + +static __always_inline void arch_spin_lock(arch_spinlock_t *lock) +{ + __arch_spin(lock)(lock); +} + +static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) +{ + return __arch_spin(trylock)(lock); +} + +static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) +{ + __arch_spin(unlock)(lock); +} + +static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, + unsigned long flags) +{ + __arch_spin(lock_flags)(lock, flags); +} + +#undef __arch_spin + +static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) +{ + while (arch_spin_is_locked(lock)) + cpu_relax(); +} + +/* + * Read-write spinlocks, allowing multiple readers + * but only one writer. + * + * NOTE! it is quite common to have readers in interrupts + * but no interrupt writers. For those circumstances we + * can "mix" irq-safe locks - any writer needs to get a + * irq-safe write-lock, but readers can get non-irqsafe + * read-locks. + * + * On x86, we implement read-write locks as a 32-bit counter + * with the high bit (sign) being the "contended" bit. + */ + +/** + * read_can_lock - would read_trylock() succeed? + * @lock: the rwlock in question. + */ +static inline int arch_read_can_lock(arch_rwlock_t *lock) +{ + return lock->lock > 0; +} + +/** + * write_can_lock - would write_trylock() succeed? + * @lock: the rwlock in question. 
+ */ +static inline int arch_write_can_lock(arch_rwlock_t *lock) +{ + return lock->write == WRITE_LOCK_CMP; +} + +static inline void arch_read_lock(arch_rwlock_t *rw) +{ + asm volatile(LOCK_PREFIX READ_LOCK_SIZE(dec) " (%0)\n\t" + "jns 1f\n" + "call __read_lock_failed\n\t" + "1:\n" + ::LOCK_PTR_REG (rw) : "memory"); +} + +static inline void arch_write_lock(arch_rwlock_t *rw) +{ + asm volatile(LOCK_PREFIX WRITE_LOCK_SUB(%1) "(%0)\n\t" + "jz 1f\n" + "call __write_lock_failed\n\t" + "1:\n" + ::LOCK_PTR_REG (&rw->write), "i" (RW_LOCK_BIAS) + : "memory"); +} + +static inline int arch_read_trylock(arch_rwlock_t *lock) +{ + READ_LOCK_ATOMIC(t) *count = (READ_LOCK_ATOMIC(t) *)lock; + + if (READ_LOCK_ATOMIC(dec_return)(count) >= 0) + return 1; + READ_LOCK_ATOMIC(inc)(count); + return 0; +} + +static inline int arch_write_trylock(arch_rwlock_t *lock) +{ + atomic_t *count = (atomic_t *)&lock->write; + + if (atomic_sub_and_test(WRITE_LOCK_CMP, count)) + return 1; + atomic_add(WRITE_LOCK_CMP, count); + return 0; +} + +static inline void arch_read_unlock(arch_rwlock_t *rw) +{ + asm volatile(LOCK_PREFIX READ_LOCK_SIZE(inc) " %0" + :"+m" (rw->lock) : : "memory"); +} + +static inline void arch_write_unlock(arch_rwlock_t *rw) +{ + asm volatile(LOCK_PREFIX WRITE_LOCK_ADD(%1) "%0" + : "+m" (rw->write) : "i" (RW_LOCK_BIAS) : "memory"); +} + +#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) +#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) + +#undef READ_LOCK_SIZE +#undef READ_LOCK_ATOMIC +#undef WRITE_LOCK_ADD +#undef WRITE_LOCK_SUB +#undef WRITE_LOCK_CMP + +#define arch_spin_relax(lock) cpu_relax() +#define arch_read_relax(lock) cpu_relax() +#define arch_write_relax(lock) cpu_relax() + +/* The {read|write|spin}_lock() on x86 are full memory barriers. */ +static inline void smp_mb__after_lock(void) { } +#define ARCH_HAS_SMP_MB_AFTER_LOCK + +#endif /* _ASM_X86_SPINLOCK_H */ diff --git a/arch/x86/include/mach-xen/asm/spinlock_types.h b/arch/x86/include/mach-xen/asm/spinlock_types.h new file mode 100644 index 0000000..d78bbc0 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/spinlock_types.h @@ -0,0 +1,62 @@ +#ifndef _ASM_X86_SPINLOCK_TYPES_H +#define _ASM_X86_SPINLOCK_TYPES_H + +#ifndef __LINUX_SPINLOCK_TYPES_H +# error "please don't include this file directly" +#endif + +#include + +#ifdef CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING +/* + * On Xen we support CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING levels of + * interrupt re-enabling per IRQ-safe lock. Hence we can have + * (CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING + 1) times as many outstanding + * tickets. Thus the cut-off for using byte register pairs must be at + * a sufficiently smaller number of CPUs. + */ +#if (CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING + 1) * CONFIG_NR_CPUS < 256 +typedef u8 __ticket_t; +# define TICKET_SHIFT 8 +typedef u16 __ticketpair_t; +#else +typedef u16 __ticket_t; +# define TICKET_SHIFT 16 +typedef u32 __ticketpair_t; +#endif + +typedef union { + __ticketpair_t head_tail; + struct { + struct __raw_tickets { + __ticket_t head, tail; + } tickets; +#if CONFIG_NR_CPUS <= 256 + u8 owner; +#else + u16 owner; +#endif + }; +#else /* ndef CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING */ +typedef struct { +/* + * This differs from the pre-2.6.24 spinlock by always using xchgb + * rather than decb to take the lock; this allows it to use a + * zero-initialized lock structure. It also maintains a 1-byte + * contention counter, so that we can implement + * __byte_spin_is_contended. 
+ */ + u8 lock; +#if CONFIG_NR_CPUS < 256 + u8 spinners; +#else +# error NR_CPUS >= 256 not implemented +#endif +#endif /* def CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING */ +} arch_spinlock_t; + +#define __ARCH_SPIN_LOCK_UNLOCKED { 0 } + +#include + +#endif /* _ASM_X86_SPINLOCK_TYPES_H */ diff --git a/arch/x86/include/mach-xen/asm/swiotlb.h b/arch/x86/include/mach-xen/asm/swiotlb.h new file mode 100644 index 0000000..e82aad1 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/swiotlb.h @@ -0,0 +1,8 @@ +#include_next + +#ifndef CONFIG_SWIOTLB +#define swiotlb_init(verbose) ((void)(verbose)) +#endif + +dma_addr_t swiotlb_map_single_phys(struct device *, phys_addr_t, size_t size, + int dir); diff --git a/arch/x86/include/mach-xen/asm/system.h b/arch/x86/include/mach-xen/asm/system.h new file mode 100644 index 0000000..09ef84b --- /dev/null +++ b/arch/x86/include/mach-xen/asm/system.h @@ -0,0 +1,520 @@ +#ifndef _ASM_X86_SYSTEM_H +#define _ASM_X86_SYSTEM_H + +#include +#include +#include +#include +#include +#include + +#include +#include + +/* entries in ARCH_DLINFO: */ +#if defined(CONFIG_IA32_EMULATION) || !defined(CONFIG_X86_64) +# define AT_VECTOR_SIZE_ARCH 2 +#else /* else it's non-compat x86-64 */ +# define AT_VECTOR_SIZE_ARCH 1 +#endif + +struct task_struct; /* one of the stranger aspects of C forward declarations */ +struct task_struct *__switch_to(struct task_struct *prev, + struct task_struct *next); +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p); +extern void show_regs_common(void); + +#ifdef CONFIG_X86_32 + +#ifdef CONFIG_CC_STACKPROTECTOR +#define __switch_canary \ + "movl %P[task_canary](%[next]), %%ebx\n\t" \ + "movl %%ebx, "__percpu_arg([stack_canary])"\n\t" +#define __switch_canary_oparam \ + , [stack_canary] "=m" (stack_canary.canary) +#define __switch_canary_iparam \ + , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) +#else /* CC_STACKPROTECTOR */ +#define __switch_canary +#define __switch_canary_oparam +#define __switch_canary_iparam +#endif /* CC_STACKPROTECTOR */ + +/* + * Saving eflags is important. It switches not only IOPL between tasks, + * it also protects other tasks from NT leaking through sysenter etc. + */ +#define switch_to(prev, next, last) \ +do { \ + /* \ + * Context-switching clobbers all registers, so we clobber \ + * them explicitly, via unused output variables. 
\ + * (EAX and EBP is not listed because EBP is saved/restored \ + * explicitly for wchan access and EAX is the return value of \ + * __switch_to()) \ + */ \ + unsigned long ebx, ecx, edx, esi, edi; \ + \ + asm volatile("pushfl\n\t" /* save flags */ \ + "pushl %%ebp\n\t" /* save EBP */ \ + "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \ + "movl %[next_sp],%%esp\n\t" /* restore ESP */ \ + "movl $1f,%[prev_ip]\n\t" /* save EIP */ \ + "pushl %[next_ip]\n\t" /* restore EIP */ \ + __switch_canary \ + "jmp __switch_to\n" /* regparm call */ \ + "1:\t" \ + "popl %%ebp\n\t" /* restore EBP */ \ + "popfl\n" /* restore flags */ \ + \ + /* output parameters */ \ + : [prev_sp] "=m" (prev->thread.sp), \ + [prev_ip] "=m" (prev->thread.ip), \ + "=a" (last), \ + \ + /* clobbered output registers: */ \ + "=b" (ebx), "=c" (ecx), "=d" (edx), \ + "=S" (esi), "=D" (edi) \ + \ + __switch_canary_oparam \ + \ + /* input parameters: */ \ + : [next_sp] "m" (next->thread.sp), \ + [next_ip] "m" (next->thread.ip), \ + \ + /* regparm parameters for __switch_to(): */ \ + [prev] "a" (prev), \ + [next] "d" (next) \ + \ + __switch_canary_iparam \ + \ + : /* reloaded segment registers */ \ + "memory"); \ +} while (0) + +#ifndef CONFIG_XEN +/* + * disable hlt during certain critical i/o operations + */ +#define HAVE_DISABLE_HLT +#endif +#else + +/* frame pointer must be last for get_wchan */ +#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t" + +#define __EXTRA_CLOBBER \ + , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \ + "r12", "r13", "r14", "r15" + +#ifdef CONFIG_CC_STACKPROTECTOR +#define __switch_canary \ + "movq %P[task_canary](%%rsi),%%r8\n\t" \ + "movq %%r8,"__percpu_arg([gs_canary])"\n\t" +#define __switch_canary_oparam \ + , [gs_canary] "=m" (irq_stack_union.stack_canary) +#define __switch_canary_iparam \ + , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) +#else /* CC_STACKPROTECTOR */ +#define __switch_canary +#define __switch_canary_oparam +#define __switch_canary_iparam +#endif /* CC_STACKPROTECTOR */ + +/* The stack unwind code needs this but it pollutes traces otherwise */ +#ifdef CONFIG_UNWIND_INFO +#define THREAD_RETURN_SYM \ + ".globl thread_return\n" \ + "thread_return:\n\t" +#else +#define THREAD_RETURN_SYM +#endif + +/* Save restore flags to clear handle leaking NT */ +#define switch_to(prev, next, last) \ + asm volatile(SAVE_CONTEXT \ + "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ + "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ + "call __switch_to\n\t" \ + THREAD_RETURN_SYM \ + "movq "__percpu_arg([current_task])",%%rsi\n\t" \ + __switch_canary \ + "movq %P[thread_info](%%rsi),%%r8\n\t" \ + "movq %%rax,%%rdi\n\t" \ + "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \ + "jnz ret_from_fork\n\t" \ + RESTORE_CONTEXT \ + : "=a" (last) \ + __switch_canary_oparam \ + : [next] "S" (next), [prev] "D" (prev), \ + [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \ + [ti_flags] "i" (offsetof(struct thread_info, flags)), \ + [_tif_fork] "i" (_TIF_FORK), \ + [thread_info] "i" (offsetof(struct task_struct, stack)), \ + [current_task] "m" (current_task) \ + __switch_canary_iparam \ + : "memory", "cc" __EXTRA_CLOBBER) +#endif + +#ifdef __KERNEL__ + +extern void xen_load_gs_index(unsigned); + +/* + * Load a segment. Fall back on loading the zero + * segment if something goes wrong.. 
+ */ +#define loadsegment(seg, value) \ +do { \ + unsigned short __val = (value); \ + \ + asm volatile(" \n" \ + "1: movl %k0,%%" #seg " \n" \ + \ + ".section .fixup,\"ax\" \n" \ + "2: xorl %k0,%k0 \n" \ + " jmp 1b \n" \ + ".previous \n" \ + \ + _ASM_EXTABLE(1b, 2b) \ + \ + : "+r" (__val) : : "memory"); \ +} while (0) + +/* + * Save a segment register away + */ +#define savesegment(seg, value) \ + asm("mov %%" #seg ",%0":"=r" (value) : : "memory") + +/* + * x86_32 user gs accessors. + */ +#ifdef CONFIG_X86_32 +#ifdef CONFIG_X86_32_LAZY_GS +#define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;}) +#define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) +#define task_user_gs(tsk) ((tsk)->thread.gs) +#define lazy_save_gs(v) savesegment(gs, (v)) +#define lazy_load_gs(v) loadsegment(gs, (v)) +#else /* X86_32_LAZY_GS */ +#define get_user_gs(regs) (u16)((regs)->gs) +#define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0) +#define task_user_gs(tsk) (task_pt_regs(tsk)->gs) +#define lazy_save_gs(v) do { } while (0) +#define lazy_load_gs(v) do { } while (0) +#endif /* X86_32_LAZY_GS */ +#endif /* X86_32 */ + +static inline unsigned long get_limit(unsigned long segment) +{ + unsigned long __limit; + asm("lsll %1,%0" : "=r" (__limit) : "r" (segment)); + return __limit + 1; +} + +static inline void xen_clts(void) +{ + HYPERVISOR_fpu_taskswitch(0); +} + +static inline void xen_stts(void) +{ + HYPERVISOR_fpu_taskswitch(1); +} + +/* + * Volatile isn't enough to prevent the compiler from reordering the + * read/write functions for the control registers and messing everything up. + * A memory clobber would solve the problem, but would prevent reordering of + * all loads stores around it, which can hurt performance. Solution is to + * use a variable and mimic reads and writes to it to enforce serialization + */ +static unsigned long __force_order; + +static inline unsigned long xen_read_cr0(void) +{ + unsigned long val; + asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order)); + return val; +} + +static inline void xen_write_cr0(unsigned long val) +{ + asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order)); +} + +#define xen_read_cr2() vcpu_info_read(arch.cr2) +#define xen_write_cr2(val) vcpu_info_write(arch.cr2, val) + +static inline unsigned long xen_read_cr3(void) +{ + unsigned long val; + asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order)); +#ifdef CONFIG_X86_32 + return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT; +#else + return machine_to_phys(val); +#endif +} + +static inline void xen_write_cr3(unsigned long val) +{ +#ifdef CONFIG_X86_32 + val = xen_pfn_to_cr3(pfn_to_mfn(val >> PAGE_SHIFT)); +#else + val = phys_to_machine(val); +#endif + asm volatile("mov %0,%%cr3": : "r" (val), "m" (__force_order)); +} + +static inline unsigned long xen_read_cr4(void) +{ + unsigned long val; + asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order)); + return val; +} + +#define xen_read_cr4_safe() xen_read_cr4() + +static inline void xen_write_cr4(unsigned long val) +{ + asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order)); +} + +#ifdef CONFIG_X86_64 +static inline unsigned long xen_read_cr8(void) +{ + return 0; +} + +static inline void xen_write_cr8(unsigned long val) +{ + BUG_ON(val); +} +#endif + +static inline void xen_wbinvd(void) +{ + asm volatile("wbinvd": : :"memory"); +} + +static inline unsigned long read_cr0(void) +{ + return xen_read_cr0(); +} + +static inline void write_cr0(unsigned long x) +{ + 
xen_write_cr0(x); +} + +static inline unsigned long read_cr2(void) +{ + return xen_read_cr2(); +} + +static inline void write_cr2(unsigned long x) +{ + xen_write_cr2(x); +} + +static inline unsigned long read_cr3(void) +{ + return xen_read_cr3(); +} + +static inline void write_cr3(unsigned long x) +{ + xen_write_cr3(x); +} + +static inline unsigned long read_cr4(void) +{ + return xen_read_cr4(); +} + +static inline unsigned long read_cr4_safe(void) +{ + return xen_read_cr4_safe(); +} + +static inline void write_cr4(unsigned long x) +{ + xen_write_cr4(x); +} + +static inline void wbinvd(void) +{ + xen_wbinvd(); +} + +#ifdef CONFIG_X86_64 + +static inline unsigned long read_cr8(void) +{ + return xen_read_cr8(); +} + +static inline void write_cr8(unsigned long x) +{ + xen_write_cr8(x); +} + +static inline void load_gs_index(unsigned selector) +{ + xen_load_gs_index(selector); +} + +#endif + +/* Clear the 'TS' bit */ +static inline void clts(void) +{ + xen_clts(); +} + +static inline void stts(void) +{ + xen_stts(); +} + +#endif /* __KERNEL__ */ + +static inline void clflush(volatile void *__p) +{ + asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p)); +} + +#define nop() asm volatile ("nop") + +void disable_hlt(void); +void enable_hlt(void); + +void cpu_idle_wait(void); + +extern unsigned long arch_align_stack(unsigned long sp); +extern void free_init_pages(char *what, unsigned long begin, unsigned long end); + +void xen_idle(void); +bool set_pm_idle_to_default(void); + +void stop_this_cpu(void *dummy); + +/* + * Force strict CPU ordering. + * And yes, this is required on UP too when we're talking + * to devices. + */ +#ifdef CONFIG_X86_32 +/* + * Some non-Intel clones support out of order store. wmb() ceases to be a + * nop for these. + */ +#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) +#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) +#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) +#else +#define mb() asm volatile("mfence":::"memory") +#define rmb() asm volatile("lfence":::"memory") +#define wmb() asm volatile("sfence" ::: "memory") +#endif + +/** + * read_barrier_depends - Flush all pending reads that subsequents reads + * depend on. + * + * No data-dependent reads from memory-like regions are ever reordered + * over this barrier. All reads preceding this primitive are guaranteed + * to access memory (but not necessarily other CPUs' caches) before any + * reads following this primitive that depend on the data return by + * any of the preceding reads. This primitive is much lighter weight than + * rmb() on most CPUs, and is never heavier weight than is + * rmb(). + * + * These ordering constraints are respected by both the local CPU + * and the compiler. + * + * Ordering is not guaranteed by anything other than these primitives, + * not even by data dependencies. See the documentation for + * memory_barrier() for examples and URLs to more information. + * + * For example, the following code would force ordering (the initial + * value of "a" is zero, "b" is one, and "p" is "&a"): + * + * + * CPU 0 CPU 1 + * + * b = 2; + * memory_barrier(); + * p = &b; q = p; + * read_barrier_depends(); + * d = *q; + * + * + * because the read of "*q" depends on the read of "p" and these + * two reads are separated by a read_barrier_depends(). 
However, + * the following code, with the same initial values for "a" and "b": + * + * + * CPU 0 CPU 1 + * + * a = 2; + * memory_barrier(); + * b = 3; y = b; + * read_barrier_depends(); + * x = a; + * + * + * does not enforce ordering, since there is no data dependency between + * the read of "a" and the read of "b". Therefore, on some CPUs, such + * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb() + * in cases like this where there are no data dependencies. + **/ + +#define read_barrier_depends() do { } while (0) + +#ifdef CONFIG_SMP +#define smp_mb() mb() +#ifdef CONFIG_X86_PPRO_FENCE +# define smp_rmb() rmb() +#else +# define smp_rmb() barrier() +#endif +#ifdef CONFIG_X86_OOSTORE +# define smp_wmb() wmb() +#else +# define smp_wmb() barrier() +#endif +#define smp_read_barrier_depends() read_barrier_depends() +#define set_mb(var, value) do { (void)xchg(&var, value); } while (0) +#else +#define smp_mb() barrier() +#define smp_rmb() barrier() +#define smp_wmb() barrier() +#define smp_read_barrier_depends() do { } while (0) +#define set_mb(var, value) do { var = value; barrier(); } while (0) +#endif + +/* + * Stop RDTSC speculation. This is needed when you need to use RDTSC + * (or get_cycles or vread that possibly accesses the TSC) in a defined + * code region. + * + * (Could use an alternative three way for this if there was one.) + */ +static __always_inline void rdtsc_barrier(void) +{ + alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); + alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); +} + +/* + * We handle most unaligned accesses in hardware. On the other hand + * unaligned DMA can be quite expensive on some Nehalem processors. + * + * Based on this we disable the IP header alignment in network drivers. + */ +#define NET_IP_ALIGN 0 +#endif /* _ASM_X86_SYSTEM_H */ diff --git a/arch/x86/include/mach-xen/asm/time.h b/arch/x86/include/mach-xen/asm/time.h new file mode 100644 index 0000000..d898756 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/time.h @@ -0,0 +1,18 @@ +#ifndef _XEN_ASM_TIME_H +#define _XEN_ASM_TIME_H + +unsigned long xen_read_wallclock(void); +int xen_write_wallclock(unsigned long); + +struct timespec; +#ifdef CONFIG_XEN_PRIVILEGED_GUEST +int xen_update_wallclock(const struct timespec *); +#else +static inline int xen_update_wallclock(const struct timespec *tv) { + return -EPERM; +} +#endif + +#endif /* _XEN_ASM_TIME_H */ + +#include_next diff --git a/arch/x86/include/mach-xen/asm/tlbflush.h b/arch/x86/include/mach-xen/asm/tlbflush.h new file mode 100644 index 0000000..0dc6dd6 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/tlbflush.h @@ -0,0 +1,114 @@ +#ifndef _ASM_X86_TLBFLUSH_H +#define _ASM_X86_TLBFLUSH_H + +#include +#include + +#include +#include + +#define __flush_tlb() xen_tlb_flush() +#define __flush_tlb_global() xen_tlb_flush() +#define __flush_tlb_single(addr) xen_invlpg(addr) +#define __flush_tlb_all() xen_tlb_flush() +#define __flush_tlb_one(addr) xen_invlpg(addr) + +#ifdef CONFIG_X86_32 +# define TLB_FLUSH_ALL 0xffffffff +#else +# define TLB_FLUSH_ALL -1ULL +#endif + +/* + * TLB flushing: + * + * - flush_tlb() flushes the current mm struct TLBs + * - flush_tlb_all() flushes all processes TLBs + * - flush_tlb_mm(mm) flushes the specified mm context TLB's + * - flush_tlb_page(vma, vmaddr) flushes one page + * - flush_tlb_range(vma, start, end) flushes a range of pages + * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages + * + * ..but the i386 has somewhat limited tlb flushing capabilities, + * and page-granular 
flushes are available only on i486 and up. + * + * x86-64 can only flush individual pages or full VMs. For a range flush + * we always do the full VM. Might be worth trying if for a small + * range a few INVLPGs in a row are a win. + */ + +#ifndef CONFIG_SMP + +#define flush_tlb() __flush_tlb() +#define flush_tlb_all() __flush_tlb_all() +#define local_flush_tlb() __flush_tlb() + +static inline void flush_tlb_mm(struct mm_struct *mm) +{ + if (mm == current->active_mm) + __flush_tlb(); +} + +static inline void flush_tlb_page(struct vm_area_struct *vma, + unsigned long addr) +{ + if (vma->vm_mm == current->active_mm) + __flush_tlb_one(addr); +} + +static inline void flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + if (vma->vm_mm == current->active_mm) + __flush_tlb(); +} + +static inline void reset_lazy_tlbstate(void) +{ +} + +#else /* SMP */ + +#include + +#define local_flush_tlb() __flush_tlb() + +#define flush_tlb_all xen_tlb_flush_all +#define flush_tlb_current_task() xen_tlb_flush_mask(mm_cpumask(current->mm)) +#define flush_tlb_mm(mm) xen_tlb_flush_mask(mm_cpumask(mm)) +#define flush_tlb_page(vma, va) xen_invlpg_mask(mm_cpumask((vma)->vm_mm), va) + +#define flush_tlb() flush_tlb_current_task() + +static inline void flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + flush_tlb_mm(vma->vm_mm); +} + +#ifndef CONFIG_XEN +#define TLBSTATE_OK 1 +#define TLBSTATE_LAZY 2 + +struct tlb_state { + struct mm_struct *active_mm; + int state; +}; +DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); + +static inline void reset_lazy_tlbstate(void) +{ + percpu_write(cpu_tlbstate.state, 0); + percpu_write(cpu_tlbstate.active_mm, &init_mm); +} +#endif + +#endif /* SMP */ + +static inline void flush_tlb_kernel_range(unsigned long start, + unsigned long end) +{ + flush_tlb_all(); +} + +#endif /* _ASM_X86_TLBFLUSH_H */ diff --git a/arch/x86/include/mach-xen/asm/vga.h b/arch/x86/include/mach-xen/asm/vga.h new file mode 100644 index 0000000..fe4a3c45 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/vga.h @@ -0,0 +1,20 @@ +/* + * Access to VGA videoram + * + * (c) 1998 Martin Mares + */ + +#ifndef _ASM_X86_VGA_H +#define _ASM_X86_VGA_H + +/* + * On the PC, we can just recalculate addresses and then + * access the videoram directly without any black magic. + */ + +#define VGA_MAP_MEM(x, s) (unsigned long)isa_bus_to_virt(x) + +#define vga_readb(x) (*(x)) +#define vga_writeb(x, y) (*(y) = (x)) + +#endif /* _ASM_X86_VGA_H */ diff --git a/arch/x86/include/mach-xen/asm/xenoprof.h b/arch/x86/include/mach-xen/asm/xenoprof.h new file mode 100644 index 0000000..2733e00 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/xenoprof.h @@ -0,0 +1,48 @@ +/****************************************************************************** + * asm-i386/mach-xen/asm/xenoprof.h + * + * Copyright (c) 2006 Isaku Yamahata + * VA Linux Systems Japan K.K. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ +#ifndef __ASM_XENOPROF_H__ +#define __ASM_XENOPROF_H__ +#ifdef CONFIG_XEN + +struct super_block; +struct dentry; +int xenoprof_create_files(struct super_block * sb, struct dentry * root); +#define HAVE_XENOPROF_CREATE_FILES + +struct xenoprof_init; +void xenoprof_arch_init_counter(struct xenoprof_init *init); +void xenoprof_arch_counter(void); +void xenoprof_arch_start(void); +void xenoprof_arch_stop(void); + +struct xenoprof_arch_shared_buffer { + /* nothing */ +}; +struct xenoprof_shared_buffer; +void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer* sbuf); +struct xenoprof_get_buffer; +int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer* get_buffer, struct xenoprof_shared_buffer* sbuf); +struct xenoprof_passive; +int xenoprof_arch_set_passive(struct xenoprof_passive* pdomain, struct xenoprof_shared_buffer* sbuf); + +#endif /* CONFIG_XEN */ +#endif /* __ASM_XENOPROF_H__ */ diff --git a/arch/x86/include/mach-xen/asm/xor.h b/arch/x86/include/mach-xen/asm/xor.h new file mode 100644 index 0000000..edb08e6 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/xor.h @@ -0,0 +1,8 @@ +#ifdef CONFIG_KMEMCHECK +/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */ +# include +#elif defined(CONFIG_X86_32) +# include "../../asm/xor_32.h" +#else +# include "xor_64.h" +#endif diff --git a/arch/x86/include/mach-xen/asm/xor_64.h b/arch/x86/include/mach-xen/asm/xor_64.h new file mode 100644 index 0000000..7d05c24 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/xor_64.h @@ -0,0 +1,339 @@ +#ifndef _ASM_X86_XOR_64_H +#define _ASM_X86_XOR_64_H + +#include + +/* + * x86-64 changes / gcc fixes from Andi Kleen. + * Copyright 2002 Andi Kleen, SuSE Labs. + * + * This hasn't been optimized for the hammer yet, but there are likely + * no advantages to be gotten from x86-64 here anyways. + */ + +typedef struct { + unsigned long a, b; +} __attribute__((aligned(16))) xmm_store_t; + +/* Doesn't use gcc to save the XMM registers, because there is no easy way to + tell it to do a clts before the register saving. 
*/ +#define XMMS_SAVE \ +do { \ + preempt_disable(); \ + if (!__thread_has_fpu(current)) \ + clts(); \ + asm volatile( \ + "movups %%xmm0,(%1) ;\n\t" \ + "movups %%xmm1,0x10(%1) ;\n\t" \ + "movups %%xmm2,0x20(%1) ;\n\t" \ + "movups %%xmm3,0x30(%1) ;\n\t" \ + : "=&r" (cr0) \ + : "r" (xmm_save) \ + : "memory"); \ +} while (0) + +#define XMMS_RESTORE \ +do { \ + asm volatile( \ + "sfence ;\n\t" \ + "movups (%1),%%xmm0 ;\n\t" \ + "movups 0x10(%1),%%xmm1 ;\n\t" \ + "movups 0x20(%1),%%xmm2 ;\n\t" \ + "movups 0x30(%1),%%xmm3 ;\n\t" \ + : \ + : "r" (cr0), "r" (xmm_save) \ + : "memory"); \ + if (!__thread_has_fpu(current)) \ + stts(); \ + preempt_enable(); \ +} while (0) + +#define OFFS(x) "16*("#x")" +#define PF_OFFS(x) "256+16*("#x")" +#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n" +#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n" +#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n" +#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n" +#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n" +#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n" +#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n" +#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n" +#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n" +#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n" +#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n" +#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n" +#define XO5(x, y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n" + + +static void +xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) +{ + unsigned int lines = bytes >> 8; + unsigned long cr0; + xmm_store_t xmm_save[4]; + + XMMS_SAVE; + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + LD(i, 0) \ + LD(i + 1, 1) \ + PF1(i) \ + PF1(i + 2) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ + PF0(i + 4) \ + PF0(i + 6) \ + XO1(i, 0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ + ST(i, 0) \ + ST(i + 1, 1) \ + ST(i + 2, 2) \ + ST(i + 3, 3) \ + + + PF0(0) + PF0(2) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addq %[inc], %[p1] ;\n" + " addq %[inc], %[p2] ;\n" + " decl %[cnt] ; jnz 1b" + : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines) + : [inc] "r" (256UL) + : "memory"); + + XMMS_RESTORE; +} + +static void +xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, + unsigned long *p3) +{ + unsigned int lines = bytes >> 8; + xmm_store_t xmm_save[4]; + unsigned long cr0; + + XMMS_SAVE; + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i + 2) \ + LD(i, 0) \ + LD(i + 1, 1) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ + PF2(i) \ + PF2(i + 2) \ + PF0(i + 4) \ + PF0(i + 6) \ + XO1(i, 0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ + XO2(i, 0) \ + XO2(i + 1, 1) \ + XO2(i + 2, 2) \ + XO2(i + 3, 3) \ + ST(i, 0) \ + ST(i + 1, 1) \ + ST(i + 2, 2) \ + ST(i + 3, 3) \ + + + PF0(0) + PF0(2) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addq %[inc], %[p1] ;\n" + " addq %[inc], %[p2] ;\n" + " addq %[inc], %[p3] ;\n" + " decl %[cnt] ; jnz 1b" + : [cnt] "+r" (lines), + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) + : [inc] "r" (256UL) + : "memory"); + XMMS_RESTORE; +} + +static void +xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, + unsigned long *p3, unsigned long *p4) +{ + unsigned int lines = bytes >> 8; + xmm_store_t xmm_save[4]; + unsigned long cr0; + + XMMS_SAVE; + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i + 2) \ + LD(i, 0) \ + LD(i + 
1, 1) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ + PF2(i) \ + PF2(i + 2) \ + XO1(i, 0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ + PF3(i) \ + PF3(i + 2) \ + PF0(i + 4) \ + PF0(i + 6) \ + XO2(i, 0) \ + XO2(i + 1, 1) \ + XO2(i + 2, 2) \ + XO2(i + 3, 3) \ + XO3(i, 0) \ + XO3(i + 1, 1) \ + XO3(i + 2, 2) \ + XO3(i + 3, 3) \ + ST(i, 0) \ + ST(i + 1, 1) \ + ST(i + 2, 2) \ + ST(i + 3, 3) \ + + + PF0(0) + PF0(2) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addq %[inc], %[p1] ;\n" + " addq %[inc], %[p2] ;\n" + " addq %[inc], %[p3] ;\n" + " addq %[inc], %[p4] ;\n" + " decl %[cnt] ; jnz 1b" + : [cnt] "+c" (lines), + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4) + : [inc] "r" (256UL) + : "memory" ); + + XMMS_RESTORE; +} + +static void +xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, + unsigned long *p3, unsigned long *p4, unsigned long *p5) +{ + unsigned int lines = bytes >> 8; + xmm_store_t xmm_save[4]; + unsigned long cr0; + + XMMS_SAVE; + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i + 2) \ + LD(i, 0) \ + LD(i + 1, 1) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ + PF2(i) \ + PF2(i + 2) \ + XO1(i, 0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ + PF3(i) \ + PF3(i + 2) \ + XO2(i, 0) \ + XO2(i + 1, 1) \ + XO2(i + 2, 2) \ + XO2(i + 3, 3) \ + PF4(i) \ + PF4(i + 2) \ + PF0(i + 4) \ + PF0(i + 6) \ + XO3(i, 0) \ + XO3(i + 1, 1) \ + XO3(i + 2, 2) \ + XO3(i + 3, 3) \ + XO4(i, 0) \ + XO4(i + 1, 1) \ + XO4(i + 2, 2) \ + XO4(i + 3, 3) \ + ST(i, 0) \ + ST(i + 1, 1) \ + ST(i + 2, 2) \ + ST(i + 3, 3) \ + + + PF0(0) + PF0(2) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addq %[inc], %[p1] ;\n" + " addq %[inc], %[p2] ;\n" + " addq %[inc], %[p3] ;\n" + " addq %[inc], %[p4] ;\n" + " addq %[inc], %[p5] ;\n" + " decl %[cnt] ; jnz 1b" + : [cnt] "+c" (lines), + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4), + [p5] "+r" (p5) + : [inc] "r" (256UL) + : "memory"); + + XMMS_RESTORE; +} + +static struct xor_block_template xor_block_sse = { + .name = "generic_sse", + .do_2 = xor_sse_2, + .do_3 = xor_sse_3, + .do_4 = xor_sse_4, + .do_5 = xor_sse_5, +}; + +#undef XOR_TRY_TEMPLATES +#define XOR_TRY_TEMPLATES \ +do { \ + xor_speed(&xor_block_sse); \ +} while (0) + +/* We force the use of the SSE xor block because it can write around L2. + We may also be able to load into the L1 only depending on how the cpu + deals with a load to a line that is being prefetched. 
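+   As a consequence, XOR_SELECT_TEMPLATE() below ignores whatever template
+   xor_speed() found to be fastest and unconditionally returns
+   &xor_block_sse; the XOR_TRY_TEMPLATES benchmark is informational only.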
*/ +#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse) + +#endif /* _ASM_X86_XOR_64_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5369059..dadd24e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -101,6 +101,8 @@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o obj-$(CONFIG_OF) += devicetree.o +obj-$(CONFIG_X86_XEN) += fixup.o + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) @@ -112,3 +114,8 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o obj-y += vsmp_64.o endif + +disabled-obj-$(CONFIG_XEN) := crash.o early-quirks.o hpet.o i8237.o i8253.o \ + i8259.o irqinit.o pci-swiotlb.o reboot.o smpboot.o trampoline%.o \ + tsc.o tsc_sync.o vsmp_64.o +disabled-obj-$(CONFIG_XEN_UNPRIVILEGED_GUEST) += probe_roms.o diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile index 6f35260..528e3de 100644 --- a/arch/x86/kernel/acpi/Makefile +++ b/arch/x86/kernel/acpi/Makefile @@ -5,6 +5,9 @@ obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_rm.o wakeup_$(BITS).o ifneq ($(CONFIG_ACPI_PROCESSOR),) obj-y += cstate.o +ifneq ($(CONFIG_PROCESSOR_EXTERNAL_CONTROL),) +obj-$(CONFIG_XEN) += processor_extcntl_xen.o +endif endif $(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin @@ -12,3 +15,4 @@ $(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin $(obj)/realmode/wakeup.bin: FORCE $(Q)$(MAKE) $(build)=$(obj)/realmode +disabled-obj-$(CONFIG_XEN) := cstate.o sleep.o wakeup_%.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 1b9b052..84cfb0d 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -70,6 +70,7 @@ int acpi_strict; u8 acpi_sci_flags __initdata; int acpi_sci_override_gsi __initdata; +#ifndef CONFIG_XEN int acpi_skip_timer_override __initdata; int acpi_use_timer_override __initdata; int acpi_fix_pin2_polarity __initdata; @@ -77,6 +78,10 @@ int acpi_fix_pin2_polarity __initdata; #ifdef CONFIG_X86_LOCAL_APIC static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; #endif +#else +#define acpi_skip_timer_override 0 +#define acpi_fix_pin2_polarity 0 +#endif #ifndef __HAVE_ARCH_CMPXCHG #warning ACPI uses CMPXCHG, i486 and later hardware @@ -182,6 +187,7 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) return -ENODEV; } +#ifndef CONFIG_XEN if (madt->address) { acpi_lapic_addr = (u64) madt->address; @@ -191,12 +197,14 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) default_acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id); +#endif return 0; } static void __cpuinit acpi_register_lapic(int id, u8 enabled) { +#ifndef CONFIG_XEN unsigned int ver = 0; if (id >= (MAX_LOCAL_APIC-1)) { @@ -213,6 +221,7 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled) ver = apic_version[boot_cpu_physical_apicid]; generic_processor_info(id, ver); +#endif } static int __init @@ -297,6 +306,7 @@ static int __init acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header, const unsigned long end) { +#ifndef CONFIG_XEN struct acpi_madt_local_apic_override *lapic_addr_ovr = NULL; lapic_addr_ovr = (struct acpi_madt_local_apic_override *)header; @@ -305,6 +315,7 @@ acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header, return -EINVAL; acpi_lapic_addr = lapic_addr_ovr->address; +#endif return 0; } @@ -593,6 +604,7 @@ void __init acpi_set_irq_model_ioapic(void) #ifdef CONFIG_ACPI_HOTPLUG_CPU #include +#ifndef CONFIG_XEN static void acpi_map_cpu2node(acpi_handle handle, int 
cpu, int physid) { #ifdef CONFIG_ACPI_NUMA @@ -678,6 +690,9 @@ free_tmp_map: out: return retval; } +#else +#define _acpi_map_lsapic(h, p) (-EINVAL) +#endif /* wrapper to silence section mismatch warning */ int __ref acpi_map_lsapic(acpi_handle handle, int *pcpu) @@ -688,9 +703,11 @@ EXPORT_SYMBOL(acpi_map_lsapic); int acpi_unmap_lsapic(int cpu) { +#ifndef CONFIG_XEN per_cpu(x86_cpu_to_apicid, cpu) = -1; set_cpu_present(cpu, false); num_processors--; +#endif return (0); } @@ -1332,6 +1349,7 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d) return 0; } +#ifndef CONFIG_XEN /* * Force ignoring BIOS IRQ0 pin2 override */ @@ -1349,6 +1367,7 @@ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) } return 0; } +#endif static int __init force_acpi_rsdt(const struct dmi_system_id *d) { @@ -1469,6 +1488,7 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { {} }; +#ifndef CONFIG_XEN /* second table for DMI checks that should run after early-quirks */ static struct dmi_system_id __initdata acpi_dmi_table_late[] = { /* @@ -1515,6 +1535,7 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = { }, {} }; +#endif /* * acpi_boot_table_init() and acpi_boot_init() @@ -1587,8 +1608,10 @@ int __init early_acpi_boot_init(void) int __init acpi_boot_init(void) { +#ifndef CONFIG_XEN /* those are executed after early-quirks are executed */ dmi_check_system(acpi_dmi_table_late); +#endif /* * If acpi_disabled, bail out @@ -1688,7 +1711,7 @@ int __init acpi_mps_check(void) return 0; } -#ifdef CONFIG_X86_IO_APIC +#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_XEN) static int __init parse_acpi_skip_timer_override(char *arg) { acpi_skip_timer_override = 1; diff --git a/arch/x86/kernel/acpi/processor_extcntl_xen.c b/arch/x86/kernel/acpi/processor_extcntl_xen.c new file mode 100644 index 0000000..6293fac --- /dev/null +++ b/arch/x86/kernel/acpi/processor_extcntl_xen.c @@ -0,0 +1,287 @@ +/* + * processor_extcntl_xen.c - interface to notify Xen + * + * Copyright (C) 2008, Intel corporation + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
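+ *
+ * In short: the notifiers in this file forward the ACPI C-state
+ * (XEN_PM_CX), P-state (XEN_PM_PX) and CPU hotplug information gathered
+ * by dom0 to the hypervisor through the XENPF_set_processor_pminfo and
+ * XENPF_cpu_hotadd platform hypercalls, so that the actual idle and
+ * frequency management is done by Xen rather than by this kernel.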
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int xen_cx_notifier(struct acpi_processor *pr, int action) +{ + int ret, count = 0, i; + xen_platform_op_t op = { + .cmd = XENPF_set_processor_pminfo, + .interface_version = XENPF_INTERFACE_VERSION, + .u.set_pminfo.id = pr->acpi_id, + .u.set_pminfo.type = XEN_PM_CX, + }; + struct xen_processor_cx *data, *buf; + struct acpi_processor_cx *cx; + + /* Convert to Xen defined structure and hypercall */ + buf = kzalloc(pr->power.count * sizeof(struct xen_processor_cx), + GFP_KERNEL); + if (!buf) + return -ENOMEM; + + data = buf; + for (i = 1; i <= pr->power.count; i++) { + cx = &pr->power.states[i]; + /* Skip invalid cstate entry */ + if (!cx->valid) + continue; + + data->type = cx->type; + data->latency = cx->latency; + data->power = cx->power; + data->reg.space_id = cx->reg.space_id; + data->reg.bit_width = cx->reg.bit_width; + data->reg.bit_offset = cx->reg.bit_offset; + data->reg.access_size = cx->reg.access_size; + data->reg.address = cx->reg.address; + + /* Get dependency relationships */ + if (cx->csd_count) { + pr_warning("_CSD found: Not supported for now!\n"); + kfree(buf); + return -EINVAL; + } else { + data->dpcnt = 0; + set_xen_guest_handle(data->dp, NULL); + } + + data++; + count++; + } + + if (!count) { + pr_info("No available Cx info for cpu %d\n", pr->acpi_id); + kfree(buf); + return -EINVAL; + } + + op.u.set_pminfo.u.power.count = count; + op.u.set_pminfo.u.power.flags.bm_control = pr->flags.bm_control; + op.u.set_pminfo.u.power.flags.bm_check = pr->flags.bm_check; + op.u.set_pminfo.u.power.flags.has_cst = pr->flags.has_cst; + op.u.set_pminfo.u.power.flags.power_setup_done = pr->flags.power_setup_done; + + set_xen_guest_handle(op.u.set_pminfo.u.power.states, buf); + ret = HYPERVISOR_platform_op(&op); + kfree(buf); + return ret; +} + +static int xen_px_notifier(struct acpi_processor *pr, int action) +{ + int ret = -EINVAL; + xen_platform_op_t op = { + .cmd = XENPF_set_processor_pminfo, + .interface_version = XENPF_INTERFACE_VERSION, + .u.set_pminfo.id = pr->acpi_id, + .u.set_pminfo.type = XEN_PM_PX, + }; + struct xen_processor_performance *perf; + struct xen_processor_px *states = NULL; + struct acpi_processor_performance *px; + struct acpi_psd_package *pdomain; + + if (!pr) + return -EINVAL; + + perf = &op.u.set_pminfo.u.perf; + px = pr->performance; + if (!px) + return -EINVAL; + + switch(action) { + case PROCESSOR_PM_CHANGE: + /* ppc dynamic handle */ + perf->flags = XEN_PX_PPC; + perf->platform_limit = pr->performance_platform_limit; + + ret = HYPERVISOR_platform_op(&op); + break; + + case PROCESSOR_PM_INIT: + /* px normal init */ + perf->flags = XEN_PX_PPC | + XEN_PX_PCT | + XEN_PX_PSS | + XEN_PX_PSD; + + /* ppc */ + perf->platform_limit = pr->performance_platform_limit; + + /* pct */ + xen_convert_pct_reg(&perf->control_register, &px->control_register); + xen_convert_pct_reg(&perf->status_register, &px->status_register); + + /* pss */ + perf->state_count = px->state_count; + states = kzalloc(px->state_count*sizeof(xen_processor_px_t),GFP_KERNEL); + if (!states) + return -ENOMEM; + xen_convert_pss_states(states, px->states, px->state_count); + set_xen_guest_handle(perf->states, states); + + /* psd */ + pdomain = &px->domain_info; + xen_convert_psd_pack(&perf->domain_info, pdomain); + if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ALL) + perf->shared_type = CPUFREQ_SHARED_TYPE_ALL; + else if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ANY) + 
perf->shared_type = CPUFREQ_SHARED_TYPE_ANY; + else if (pdomain->coord_type == DOMAIN_COORD_TYPE_HW_ALL) + perf->shared_type = CPUFREQ_SHARED_TYPE_HW; + else { + ret = -ENODEV; + kfree(states); + break; + } + + ret = HYPERVISOR_platform_op(&op); + kfree(states); + break; + + default: + break; + } + + return ret; +} + +static int xen_tx_notifier(struct acpi_processor *pr, int action) +{ + return -EINVAL; +} + +static int xen_hotplug_notifier(struct acpi_processor *pr, int event) +{ + int ret = -EINVAL; +#ifdef CONFIG_ACPI_HOTPLUG_CPU + acpi_status status = 0; + acpi_object_type type; + uint32_t apic_id; + int device_decl = 0; + unsigned long long pxm; + xen_platform_op_t op; + + status = acpi_get_type(pr->handle, &type); + if (ACPI_FAILURE(status)) { + pr_warn("can't get object type for acpi_id %#x\n", + pr->acpi_id); + return -ENXIO; + } + + switch (type) { + case ACPI_TYPE_PROCESSOR: + break; + case ACPI_TYPE_DEVICE: + device_decl = 1; + break; + default: + pr_warn("unsupported object type %#x for acpi_id %#x\n", + type, pr->acpi_id); + return -EOPNOTSUPP; + } + + apic_id = acpi_get_cpuid(pr->handle, ~device_decl, pr->acpi_id); + if (apic_id < 0) { + pr_warn("can't get apic_id for acpi_id %#x\n", pr->acpi_id); + return -ENODATA; + } + + status = acpi_evaluate_integer(pr->handle, "_PXM", NULL, &pxm); + if (ACPI_FAILURE(status)) { + pr_warn("can't get pxm for acpi_id %#x\n", pr->acpi_id); + return -ENODATA; + } + + switch (event) { + case HOTPLUG_TYPE_ADD: + op.cmd = XENPF_cpu_hotadd; + op.u.cpu_add.apic_id = apic_id; + op.u.cpu_add.acpi_id = pr->acpi_id; + op.u.cpu_add.pxm = pxm; + ret = HYPERVISOR_platform_op(&op); + break; + case HOTPLUG_TYPE_REMOVE: + pr_warn("Xen doesn't support CPU hot remove\n"); + ret = -EOPNOTSUPP; + break; + } +#endif + + return ret; +} + +static struct processor_extcntl_ops xen_extcntl_ops = { + .hotplug = xen_hotplug_notifier, +}; + +static int __init init_extcntl(void) +{ + unsigned int pmbits = (xen_start_info->flags & SIF_PM_MASK) >> 8; + +#ifndef CONFIG_ACPI_HOTPLUG_CPU + if (!pmbits) + return 0; +#endif + if (pmbits & XEN_PROCESSOR_PM_CX) + xen_extcntl_ops.pm_ops[PM_TYPE_IDLE] = xen_cx_notifier; + if (pmbits & XEN_PROCESSOR_PM_PX) + xen_extcntl_ops.pm_ops[PM_TYPE_PERF] = xen_px_notifier; + if (pmbits & XEN_PROCESSOR_PM_TX) + xen_extcntl_ops.pm_ops[PM_TYPE_THR] = xen_tx_notifier; + + processor_extcntl_ops = &xen_extcntl_ops; + + return 0; +} +arch_initcall(init_extcntl); + +unsigned int cpufreq_quick_get(unsigned int cpu) +{ + xen_platform_op_t op; + + op.cmd = XENPF_get_cpu_freq; + op.u.get_cpu_freq.vcpu = cpu; + return HYPERVISOR_platform_op(&op) == 0 ? op.u.get_cpu_freq.freq : 0; +} + +unsigned int cpufreq_quick_get_max(unsigned int cpu) +{ + xen_platform_op_t op; + + op.cmd = XENPF_get_cpu_freq_max; + op.u.get_cpu_freq.vcpu = cpu; + return HYPERVISOR_platform_op(&op) == 0 ? 
op.u.get_cpu_freq.freq : 0; +} +EXPORT_SYMBOL(cpufreq_quick_get_max); diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index be16854..1bd8529 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -15,6 +15,10 @@ static u32 *flush_words; const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, +#ifdef CONFIG_XEN + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_11H_NB_MISC) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) }, /* Fam12, Fam14 */ +#endif { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) }, {} }; @@ -150,6 +154,7 @@ struct resource *amd_get_mmconfig_range(struct resource *res) return res; } +#ifndef CONFIG_XEN int amd_get_subcaches(int cpu) { struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link; @@ -204,6 +209,7 @@ int amd_set_subcaches(int cpu, int mask) return 0; } +#endif static int amd_cache_gart(void) { diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 0ae0323..f30b902 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -25,3 +25,7 @@ obj-$(CONFIG_X86_ES7000) += es7000_32.o # For 32bit, probe_32 need to be listed last obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o + +probe_64-$(CONFIG_XEN) := probe_32.o + +disabled-obj-$(CONFIG_XEN) := apic_%.o diff --git a/arch/x86/kernel/apic/apic-xen.c b/arch/x86/kernel/apic/apic-xen.c new file mode 100644 index 0000000..6b0603c --- /dev/null +++ b/arch/x86/kernel/apic/apic-xen.c @@ -0,0 +1,69 @@ +/* + * Local APIC handling stubs + */ + +#include +#include + +#include +#include +#include + +unsigned int num_processors; + +/* + * Debug level, exported for io_apic.c + */ +unsigned int apic_verbosity; + +/* Have we found an MP table */ +int smp_found_config; + +static int __init apic_set_verbosity(char *arg) +{ + if (!arg) { +#ifdef CONFIG_X86_64 + skip_ioapic_setup = 0; + return 0; +#endif + return -EINVAL; + } + + if (strcmp("debug", arg) == 0) + apic_verbosity = APIC_DEBUG; + else if (strcmp("verbose", arg) == 0) + apic_verbosity = APIC_VERBOSE; + else { + pr_warning("APIC Verbosity level %s not recognised" + " use apic=verbose or apic=debug\n", arg); + return -EINVAL; + } + + return 0; +} +early_param("apic", apic_set_verbosity); + +int setup_profiling_timer(unsigned int multiplier) +{ + return -EINVAL; +} + +#ifndef CONFIG_SMP +/* + * This initializes the IO-APIC and APIC hardware if this is + * a UP kernel. + */ +int __init APIC_init_uniprocessor(void) +{ +#ifdef CONFIG_X86_IO_APIC + if (smp_found_config && !skip_ioapic_setup && nr_ioapics) + setup_IO_APIC(); +# ifdef CONFIG_X86_64 + else + nr_ioapics = 0; +# endif +#endif + + return 0; +} +#endif diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 31cb9ae..8773f2c 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -26,6 +26,10 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh) #endif #ifdef arch_trigger_all_cpu_backtrace +#ifdef CONFIG_XEN +#include +#endif + /* For reliability, we're prepared to waste bits here. 
*/ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; @@ -46,7 +50,11 @@ void arch_trigger_all_cpu_backtrace(void) cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); printk(KERN_INFO "sending NMI to all CPUs:\n"); +#ifndef CONFIG_XEN apic->send_IPI_all(NMI_VECTOR); +#else /* this works even without CONFIG_X86_LOCAL_APIC */ + xen_send_IPI_all(NMI_VECTOR); +#endif /* Wait for up to 10 seconds for all CPUs to do the backtrace */ for (i = 0; i < 10 * 1000; i++) { diff --git a/arch/x86/kernel/apic/io_apic-xen.c b/arch/x86/kernel/apic/io_apic-xen.c new file mode 100644 index 0000000..2784be7 --- /dev/null +++ b/arch/x86/kernel/apic/io_apic-xen.c @@ -0,0 +1,4199 @@ +/* + * Intel IO-APIC support for multi-Pentium hosts. + * + * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo + * + * Many thanks to Stig Venaas for trying out countless experimental + * patches and reporting/debugging problems patiently! + * + * (c) 1999, Multiple IO-APIC support, developed by + * Ken-ichi Yaku and + * Hidemi Kishimoto , + * further tested and cleaned up by Zach Brown + * and Ingo Molnar + * + * Fixes + * Maciej W. Rozycki : Bits for genuine 82489DX APICs; + * thanks to Eric Gilmore + * and Rolf G. Tews + * for testing these extensively + * Paul Diefenbaugh : Added full ACPI support + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* time_after() */ +#include +#ifdef CONFIG_ACPI +#include +#endif +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef CONFIG_XEN +#include +#include +#include + +/* Fake i8259 */ +static void make_8259A_irq(unsigned int irq) { io_apic_irqs &= ~(1UL<next) + +/* + * Is the SiS APIC rmw bug present ? + * -1 = don't know, 0 = no, 1 = yes + */ +int sis_apic_bug = -1; + +static DEFINE_RAW_SPINLOCK(ioapic_lock); +#ifndef CONFIG_XEN +static DEFINE_RAW_SPINLOCK(vector_lock); +#endif + +static struct ioapic { + /* + * # of IRQ routing registers + */ + int nr_registers; +#ifndef CONFIG_XEN + /* + * Saved state during suspend/resume, or while enabling intr-remap. 
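+	 * Not built under CONFIG_XEN, where the kernel does not save or
+	 * restore the physical RTEs itself; save_ioapic_entries(),
+	 * mask_ioapic_entries() and restore_ioapic_entries() further down
+	 * are compiled out for the same reason.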
+ */ + struct IO_APIC_route_entry *saved_registers; +#endif + /* I/O APIC config */ + struct mpc_ioapic mp_config; + /* IO APIC gsi routing info */ + struct mp_ioapic_gsi gsi_config; + DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); +} ioapics[MAX_IO_APICS]; + +#define mpc_ioapic_ver(ioapic_idx) ioapics[ioapic_idx].mp_config.apicver + +int mpc_ioapic_id(int ioapic_idx) +{ + return ioapics[ioapic_idx].mp_config.apicid; +} + +unsigned int mpc_ioapic_addr(int ioapic_idx) +{ + return ioapics[ioapic_idx].mp_config.apicaddr; +} + +struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx) +{ + return &ioapics[ioapic_idx].gsi_config; +} + +int nr_ioapics; + +/* The one past the highest gsi number used */ +u32 gsi_top; + +/* MP IRQ source entries */ +struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; + +/* # of MP IRQ source entries */ +int mp_irq_entries; + +#ifndef CONFIG_XEN +/* GSI interrupts */ +static int nr_irqs_gsi = NR_IRQS_LEGACY; +#endif + +#if defined (CONFIG_MCA) || defined (CONFIG_EISA) +int mp_bus_id_to_type[MAX_MP_BUSSES]; +#endif + +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); + +int skip_ioapic_setup; + +/** + * disable_ioapic_support() - disables ioapic support at runtime + */ +static void __init _disable_ioapic_support(void) +{ +#ifdef CONFIG_PCI + noioapicquirk = 1; + noioapicreroute = -1; +#endif + skip_ioapic_setup = 1; +} + +static int __init parse_noapic(char *str) +{ + /* disable IO-APIC */ + _disable_ioapic_support(); + return 0; +} +early_param("noapic", parse_noapic); + +static int io_apic_setup_irq_pin(unsigned int irq, int node, + struct io_apic_irq_attr *attr); + +/* Will be called in mpparse/acpi/sfi codes for saving IRQ info */ +void mp_save_irq(struct mpc_intsrc *m) +{ + int i; + + apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," + " IRQ %02x, APIC ID %x, APIC INT %02x\n", + m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus, + m->srcbusirq, m->dstapic, m->dstirq); + + for (i = 0; i < mp_irq_entries; i++) { + if (!memcmp(&mp_irqs[i], m, sizeof(*m))) + return; + } + + memcpy(&mp_irqs[mp_irq_entries], m, sizeof(*m)); + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!!\n"); +} + +#ifndef CONFIG_XEN +struct irq_pin_list { + int apic, pin; + struct irq_pin_list *next; +}; + +static struct irq_pin_list *alloc_irq_pin_list(int node) +{ + return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node); +} + + +/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ +static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY]; + +int __init arch_early_irq_init(void) +{ + struct irq_cfg *cfg; + int count, node, i; + + if (!legacy_pic->nr_legacy_irqs) + io_apic_irqs = ~0UL; + + for (i = 0; i < nr_ioapics; i++) { + ioapics[i].saved_registers = + kzalloc(sizeof(struct IO_APIC_route_entry) * + ioapics[i].nr_registers, GFP_KERNEL); + if (!ioapics[i].saved_registers) + pr_err("IOAPIC %d: suspend/resume impossible!\n", i); + } + + cfg = irq_cfgx; + count = ARRAY_SIZE(irq_cfgx); + node = cpu_to_node(0); + + /* Make sure the legacy interrupts are marked in the bitmap */ + irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs); + + for (i = 0; i < count; i++) { + irq_set_chip_data(i, &cfg[i]); + zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node); + zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node); + /* + * For legacy IRQ's, start with assigning irq0 to irq15 to + * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0. 
+ */ + if (i < legacy_pic->nr_legacy_irqs) { + cfg[i].vector = IRQ0_VECTOR + i; + cpumask_set_cpu(0, cfg[i].domain); + } + } + + return 0; +} + +static struct irq_cfg *irq_cfg(unsigned int irq) +{ + return irq_get_chip_data(irq); +} + +static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node) +{ + struct irq_cfg *cfg; + + cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node); + if (!cfg) + return NULL; + if (!zalloc_cpumask_var_node(&cfg->domain, GFP_KERNEL, node)) + goto out_cfg; + if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node)) + goto out_domain; + return cfg; +out_domain: + free_cpumask_var(cfg->domain); +out_cfg: + kfree(cfg); + return NULL; +} + +static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) +{ + if (!cfg) + return; + irq_set_chip_data(at, NULL); + free_cpumask_var(cfg->domain); + free_cpumask_var(cfg->old_domain); + kfree(cfg); +} + +static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node) +{ + int res = irq_alloc_desc_at(at, node); + struct irq_cfg *cfg; + + if (res < 0) { + if (res != -EEXIST) + return NULL; + cfg = irq_get_chip_data(at); + if (cfg) + return cfg; + } + + cfg = alloc_irq_cfg(at, node); + if (cfg) + irq_set_chip_data(at, cfg); + else + irq_free_desc(at); + return cfg; +} + +static int alloc_irq_from(unsigned int from, int node) +{ + return irq_alloc_desc_from(from, node); +} + +static void free_irq_at(unsigned int at, struct irq_cfg *cfg) +{ + free_irq_cfg(at, cfg); + irq_free_desc(at); +} + +struct io_apic { + unsigned int index; + unsigned int unused[3]; + unsigned int data; + unsigned int unused2[11]; + unsigned int eoi; +}; + +static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) +{ + return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) + + (mpc_ioapic_addr(idx) & ~PAGE_MASK); +} + +static inline void io_apic_eoi(unsigned int apic, unsigned int vector) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(vector, &io_apic->eoi); +} +#endif /* !CONFIG_XEN */ + +static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) +{ +#ifndef CONFIG_XEN + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); + return readl(&io_apic->data); +#else + struct physdev_apic apic_op; + int ret; + + apic_op.apic_physbase = mpc_ioapic_addr(apic); + apic_op.reg = reg; + ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op); + if (ret) + return ret; + return apic_op.value; +#endif +} + +static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +{ +#ifndef CONFIG_XEN + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); + writel(value, &io_apic->data); +#else + struct physdev_apic apic_op; + + apic_op.apic_physbase = mpc_ioapic_addr(apic); + apic_op.reg = reg; + apic_op.value = value; + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op)); +#endif +} + +#ifdef CONFIG_XEN +#define io_apic_modify io_apic_write +#else +/* + * Re-write a value: to be used for read-modify-write + * cycles where the read already set up the index register. 
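+ * (Not used in the Xen build: io_apic_modify is aliased to io_apic_write
+ * above, because every register access goes through the
+ * PHYSDEVOP_apic_read/PHYSDEVOP_apic_write hypercalls and there is no
+ * locally mapped index register to re-program.)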
+ * + * Older SiS APIC requires we rewrite the index register + */ +static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + + if (sis_apic_bug) + writel(reg, &io_apic->index); + writel(value, &io_apic->data); +} + +static bool io_apic_level_ack_pending(struct irq_cfg *cfg) +{ + struct irq_pin_list *entry; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + for_each_irq_pin(entry, cfg->irq_2_pin) { + unsigned int reg; + int pin; + + pin = entry->pin; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + /* Is the remote IRR bit set? */ + if (reg & IO_APIC_REDIR_REMOTE_IRR) { + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + return true; + } + } + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + return false; +} +#endif /* CONFIG_XEN */ + +union entry_union { + struct { u32 w1, w2; }; + struct IO_APIC_route_entry entry; +}; + +#ifndef CONFIG_XEN +static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin) +{ + union entry_union eu; + + eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); + eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); + return eu.entry; +} + +static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) +{ + union entry_union eu; + unsigned long flags; + raw_spin_lock_irqsave(&ioapic_lock, flags); + eu.entry = __ioapic_read_entry(apic, pin); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + return eu.entry; +} +#endif + +/* + * When we write a new IO APIC routing entry, we need to write the high + * word first! If the mask bit in the low word is clear, we will enable + * the interrupt, and we need to make sure the entry is fully populated + * before that happens. + */ +static void +__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +{ + union entry_union eu = {{0, 0}}; + + eu.entry = e; + io_apic_write(apic, 0x11 + 2*pin, eu.w2); + io_apic_write(apic, 0x10 + 2*pin, eu.w1); +} + +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +{ + unsigned long flags; + raw_spin_lock_irqsave(&ioapic_lock, flags); + __ioapic_write_entry(apic, pin, e); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + +#ifndef CONFIG_XEN +/* + * When we mask an IO APIC routing entry, we need to write the low + * word first, in order to set the mask bit before we change the + * high bits! + */ +static void ioapic_mask_entry(int apic, int pin) +{ + unsigned long flags; + union entry_union eu = { .entry.mask = 1 }; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x10 + 2*pin, eu.w1); + io_apic_write(apic, 0x11 + 2*pin, eu.w2); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + +/* + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are + * shared ISA-space IRQs, so we have to support them. We are super + * fast in the common case, and fast for shared ISA-space IRQs. 
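+ *
+ * Each irq_cfg carries a singly linked irq_2_pin list of (apic, pin)
+ * pairs; __add_pin_to_irq_node() appends a pair unless it is already
+ * present.  For an ISA IRQ that is routed to two pins this ends up,
+ * roughly, as
+ *
+ *	cfg->irq_2_pin -> { apic 0, pin 0 } -> { apic 0, pin 2 } -> NULL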
+ */ +static int +__add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +{ + struct irq_pin_list **last, *entry; + + /* don't allow duplicates */ + last = &cfg->irq_2_pin; + for_each_irq_pin(entry, cfg->irq_2_pin) { + if (entry->apic == apic && entry->pin == pin) + return 0; + last = &entry->next; + } + + entry = alloc_irq_pin_list(node); + if (!entry) { + printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n", + node, apic, pin); + return -ENOMEM; + } + entry->apic = apic; + entry->pin = pin; + + *last = entry; + return 0; +} + +static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +{ + if (__add_pin_to_irq_node(cfg, node, apic, pin)) + panic("IO-APIC: failed to add irq-pin. Can not proceed\n"); +} + +/* + * Reroute an IRQ to a different pin. + */ +static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, + int oldapic, int oldpin, + int newapic, int newpin) +{ + struct irq_pin_list *entry; + + for_each_irq_pin(entry, cfg->irq_2_pin) { + if (entry->apic == oldapic && entry->pin == oldpin) { + entry->apic = newapic; + entry->pin = newpin; + /* every one is different, right? */ + return; + } + } + + /* old apic/pin didn't exist, so just add new ones */ + add_pin_to_irq_node(cfg, node, newapic, newpin); +} + +static void __io_apic_modify_irq(struct irq_pin_list *entry, + int mask_and, int mask_or, + void (*final)(struct irq_pin_list *entry)) +{ + unsigned int reg, pin; + + pin = entry->pin; + reg = io_apic_read(entry->apic, 0x10 + pin * 2); + reg &= mask_and; + reg |= mask_or; + io_apic_modify(entry->apic, 0x10 + pin * 2, reg); + if (final) + final(entry); +} + +static void io_apic_modify_irq(struct irq_cfg *cfg, + int mask_and, int mask_or, + void (*final)(struct irq_pin_list *entry)) +{ + struct irq_pin_list *entry; + + for_each_irq_pin(entry, cfg->irq_2_pin) + __io_apic_modify_irq(entry, mask_and, mask_or, final); +} + +static void io_apic_sync(struct irq_pin_list *entry) +{ + /* + * Synchronize the IO-APIC and the CPU by doing + * a dummy read from the IO-APIC + */ + struct io_apic __iomem *io_apic; + io_apic = io_apic_base(entry->apic); + readl(&io_apic->data); +} + +static void mask_ioapic(struct irq_cfg *cfg) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void mask_ioapic_irq(struct irq_data *data) +{ + mask_ioapic(data->chip_data); +} + +static void __unmask_ioapic(struct irq_cfg *cfg) +{ + io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL); +} + +static void unmask_ioapic(struct irq_cfg *cfg) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + __unmask_ioapic(cfg); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void unmask_ioapic_irq(struct irq_data *data) +{ + unmask_ioapic(data->chip_data); +} + +/* + * IO-APIC versions below 0x20 don't support EOI register. + * For the record, here is the information about various versions: + * 0Xh 82489DX + * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant + * 2Xh I/O(x)APIC which is PCI 2.2 Compliant + * 30h-FFh Reserved + * + * Some of the Intel ICH Specs (ICH2 to ICH5) documents the io-apic + * version as 0x2. This is an error with documentation and these ICH chips + * use io-apic's of version 0x20. + * + * For IO-APIC's with EOI register, we use that to do an explicit EOI. 
+ * Otherwise, we simulate the EOI message manually by changing the trigger + * mode to edge and then back to level, with RTE being masked during this. + */ +static void __eoi_ioapic_pin(int apic, int pin, int vector, struct irq_cfg *cfg) +{ + if (mpc_ioapic_ver(apic) >= 0x20) { + /* + * Intr-remapping uses pin number as the virtual vector + * in the RTE. Actual vector is programmed in + * intr-remapping table entry. Hence for the io-apic + * EOI we use the pin number. + */ + if (cfg && irq_remapped(cfg)) + io_apic_eoi(apic, pin); + else + io_apic_eoi(apic, vector); + } else { + struct IO_APIC_route_entry entry, entry1; + + entry = entry1 = __ioapic_read_entry(apic, pin); + + /* + * Mask the entry and change the trigger mode to edge. + */ + entry1.mask = 1; + entry1.trigger = IOAPIC_EDGE; + + __ioapic_write_entry(apic, pin, entry1); + + /* + * Restore the previous level triggered entry. + */ + __ioapic_write_entry(apic, pin, entry); + } +} + +static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) +{ + struct irq_pin_list *entry; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + for_each_irq_pin(entry, cfg->irq_2_pin) + __eoi_ioapic_pin(entry->apic, entry->pin, cfg->vector, cfg); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) +{ + struct IO_APIC_route_entry entry; + + /* Check delivery_mode to be sure we're not clearing an SMI pin */ + entry = ioapic_read_entry(apic, pin); + if (entry.delivery_mode == dest_SMI) + return; + + /* + * Make sure the entry is masked and re-read the contents to check + * if it is a level triggered pin and if the remote-IRR is set. + */ + if (!entry.mask) { + entry.mask = 1; + ioapic_write_entry(apic, pin, entry); + entry = ioapic_read_entry(apic, pin); + } + + if (entry.irr) { + unsigned long flags; + + /* + * Make sure the trigger mode is set to level. Explicit EOI + * doesn't clear the remote-IRR if the trigger mode is not + * set to level. + */ + if (!entry.trigger) { + entry.trigger = IOAPIC_LEVEL; + ioapic_write_entry(apic, pin, entry); + } + + raw_spin_lock_irqsave(&ioapic_lock, flags); + __eoi_ioapic_pin(apic, pin, entry.vector, NULL); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + } + + /* + * Clear the rest of the bits in the IO-APIC RTE except for the mask + * bit. + */ + ioapic_mask_entry(apic, pin); + entry = ioapic_read_entry(apic, pin); + if (entry.irr) + printk(KERN_ERR "Unable to reset IRR for apic: %d, pin :%d\n", + mpc_ioapic_id(apic), pin); +} + +static void clear_IO_APIC (void) +{ + int apic, pin; + + for (apic = 0; apic < nr_ioapics; apic++) + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) + clear_IO_APIC_pin(apic, pin); +} +#else +#define add_pin_to_irq_node(cfg, node, apic, pin) +#define __add_pin_to_irq_node(cfg, node, apic, pin) 0 +#endif /* !CONFIG_XEN */ + +#ifdef CONFIG_X86_32 +/* + * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to + * specific CPU-side IRQs. + */ + +#define MAX_PIRQS 8 +static int pirq_entries[MAX_PIRQS] = { + [0 ... MAX_PIRQS - 1] = -1 +}; + +static int __init ioapic_pirq_setup(char *str) +{ + int i, max; + int ints[MAX_PIRQS+1]; + + get_options(str, ARRAY_SIZE(ints), ints); + + apic_printk(APIC_VERBOSE, KERN_INFO + "PIRQ redirection, working around broken MP-BIOS.\n"); + max = MAX_PIRQS; + if (ints[0] < MAX_PIRQS) + max = ints[0]; + + for (i = 0; i < max; i++) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + "... 
PIRQ%d -> IRQ %d\n", i, ints[i+1]); + /* + * PIRQs are mapped upside down, usually. + */ + pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; + } + return 1; +} + +__setup("pirq=", ioapic_pirq_setup); +#endif /* CONFIG_X86_32 */ + +#ifndef CONFIG_XEN +/* + * Saves all the IO-APIC RTE's + */ +int save_ioapic_entries(void) +{ + int apic, pin; + int err = 0; + + for (apic = 0; apic < nr_ioapics; apic++) { + if (!ioapics[apic].saved_registers) { + err = -ENOMEM; + continue; + } + + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) + ioapics[apic].saved_registers[pin] = + ioapic_read_entry(apic, pin); + } + + return err; +} + +/* + * Mask all IO APIC entries. + */ +void mask_ioapic_entries(void) +{ + int apic, pin; + + for (apic = 0; apic < nr_ioapics; apic++) { + if (!ioapics[apic].saved_registers) + continue; + + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) { + struct IO_APIC_route_entry entry; + + entry = ioapics[apic].saved_registers[pin]; + if (!entry.mask) { + entry.mask = 1; + ioapic_write_entry(apic, pin, entry); + } + } + } +} + +/* + * Restore IO APIC entries which was saved in the ioapic structure. + */ +int restore_ioapic_entries(void) +{ + int apic, pin; + + for (apic = 0; apic < nr_ioapics; apic++) { + if (!ioapics[apic].saved_registers) + continue; + + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) + ioapic_write_entry(apic, pin, + ioapics[apic].saved_registers[pin]); + } + return 0; +} +#endif /* CONFIG_XEN */ + +/* + * Find the IRQ entry number of a certain pin. + */ +static int find_irq_entry(int ioapic_idx, int pin, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].irqtype == type && + (mp_irqs[i].dstapic == mpc_ioapic_id(ioapic_idx) || + mp_irqs[i].dstapic == MP_APIC_ALL) && + mp_irqs[i].dstirq == pin) + return i; + + return -1; +} + +#ifndef CONFIG_XEN +/* + * Find the pin to which IRQ[irq] (ISA) is connected + */ +static int __init find_isa_irq_pin(int irq, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].srcbus; + + if (test_bit(lbus, mp_bus_not_pci) && + (mp_irqs[i].irqtype == type) && + (mp_irqs[i].srcbusirq == irq)) + + return mp_irqs[i].dstirq; + } + return -1; +} + +static int __init find_isa_irq_apic(int irq, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].srcbus; + + if (test_bit(lbus, mp_bus_not_pci) && + (mp_irqs[i].irqtype == type) && + (mp_irqs[i].srcbusirq == irq)) + break; + } + + if (i < mp_irq_entries) { + int ioapic_idx; + + for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) + if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic) + return ioapic_idx; + } + + return -1; +} +#endif + +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) +/* + * EISA Edge/Level control register, ELCR + */ +static int EISA_ELCR(unsigned int irq) +{ + if (irq < legacy_pic->nr_legacy_irqs) { + unsigned int port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; + } + apic_printk(APIC_VERBOSE, KERN_INFO + "Broken MPtable reports ISA irq %d\n", irq); + return 0; +} + +#endif + +/* ISA interrupts are always polarity zero edge triggered, + * when listed as conforming in the MP table. */ + +#define default_ISA_trigger(idx) (0) +#define default_ISA_polarity(idx) (0) + +/* EISA interrupts are always polarity zero and can be edge or level + * trigger depending on the ELCR value. 
If an interrupt is listed as + * EISA conforming in the MP table, that means its trigger type must + * be read in from the ELCR */ + +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq)) +#define default_EISA_polarity(idx) default_ISA_polarity(idx) + +/* PCI interrupts are always polarity one level triggered, + * when listed as conforming in the MP table. */ + +#define default_PCI_trigger(idx) (1) +#define default_PCI_polarity(idx) (1) + +/* MCA interrupts are always polarity zero level triggered, + * when listed as conforming in the MP table. */ + +#define default_MCA_trigger(idx) (1) +#define default_MCA_polarity(idx) default_ISA_polarity(idx) + +static int irq_polarity(int idx) +{ + int bus = mp_irqs[idx].srcbus; + int polarity; + + /* + * Determine IRQ line polarity (high active or low active): + */ + switch (mp_irqs[idx].irqflag & 3) + { + case 0: /* conforms, ie. bus-type dependent polarity */ + if (test_bit(bus, mp_bus_not_pci)) + polarity = default_ISA_polarity(idx); + else + polarity = default_PCI_polarity(idx); + break; + case 1: /* high active */ + { + polarity = 0; + break; + } + case 2: /* reserved */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + case 3: /* low active */ + { + polarity = 1; + break; + } + default: /* invalid */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + } + return polarity; +} + +static int irq_trigger(int idx) +{ + int bus = mp_irqs[idx].srcbus; + int trigger; + + /* + * Determine IRQ trigger mode (edge or level sensitive): + */ + switch ((mp_irqs[idx].irqflag>>2) & 3) + { + case 0: /* conforms, ie. bus-type dependent */ + if (test_bit(bus, mp_bus_not_pci)) + trigger = default_ISA_trigger(idx); + else + trigger = default_PCI_trigger(idx); +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) + switch (mp_bus_id_to_type[bus]) { + case MP_BUS_ISA: /* ISA pin */ + { + /* set before the switch */ + break; + } + case MP_BUS_EISA: /* EISA pin */ + { + trigger = default_EISA_trigger(idx); + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + /* set before the switch */ + break; + } + case MP_BUS_MCA: /* MCA pin */ + { + trigger = default_MCA_trigger(idx); + break; + } + default: + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; + break; + } + } +#endif + break; + case 1: /* edge */ + { + trigger = 0; + break; + } + case 2: /* reserved */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; + break; + } + case 3: /* level */ + { + trigger = 1; + break; + } + default: /* invalid */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 0; + break; + } + } + return trigger; +} + +static int pin_2_irq(int idx, int apic, int pin) +{ + int irq; + int bus = mp_irqs[idx].srcbus; + struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(apic); + + /* + * Debugging check, we are in big trouble if this message pops up! + */ + if (mp_irqs[idx].dstirq != pin) + printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); + + if (test_bit(bus, mp_bus_not_pci)) { + irq = mp_irqs[idx].srcbusirq; + } else { + u32 gsi = gsi_cfg->gsi_base + pin; + + if (gsi >= NR_IRQS_LEGACY) + irq = gsi; + else + irq = gsi_top + gsi; + } + +#ifdef CONFIG_X86_32 + /* + * PCI IRQ command line redirection. Yes, limits are hardcoded. 
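+	 * pirq_entries[] is filled by ioapic_pirq_setup() above from the
+	 * "pirq=" boot option: an entry of 0 disables the pin, any other
+	 * value overrides the IRQ used for ioapic pins 16-23, so e.g.
+	 * "pirq=9,11" routes PIRQ0 to IRQ 9 and PIRQ1 to IRQ 11.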
+ */ + if ((pin >= 16) && (pin <= 23)) { + if (pirq_entries[pin-16] != -1) { + if (!pirq_entries[pin-16]) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + "disabling PIRQ%d\n", pin-16); + } else { + irq = pirq_entries[pin-16]; + apic_printk(APIC_VERBOSE, KERN_DEBUG + "using PIRQ%d -> IRQ %d\n", + pin-16, irq); + } + } + } +#endif + + return irq; +} + +/* + * Find a specific PCI IRQ entry. + * Not an __init, possibly needed by modules + */ +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin, + struct io_apic_irq_attr *irq_attr) +{ + int ioapic_idx, i, best_guess = -1; + + apic_printk(APIC_DEBUG, + "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", + bus, slot, pin); + if (test_bit(bus, mp_bus_not_pci)) { + apic_printk(APIC_VERBOSE, + "PCI BIOS passed nonexistent PCI bus %d!\n", bus); + return -1; + } + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].srcbus; + + for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) + if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic || + mp_irqs[i].dstapic == MP_APIC_ALL) + break; + + if (!test_bit(lbus, mp_bus_not_pci) && + !mp_irqs[i].irqtype && + (bus == lbus) && + (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) { + int irq = pin_2_irq(i, ioapic_idx, mp_irqs[i].dstirq); + + if (!(ioapic_idx || IO_APIC_IRQ(irq))) + continue; + + if (pin == (mp_irqs[i].srcbusirq & 3)) { + set_io_apic_irq_attr(irq_attr, ioapic_idx, + mp_irqs[i].dstirq, + irq_trigger(i), + irq_polarity(i)); + return irq; + } + /* + * Use the first all-but-pin matching entry as a + * best-guess fuzzy result for broken mptables. + */ + if (best_guess < 0) { + set_io_apic_irq_attr(irq_attr, ioapic_idx, + mp_irqs[i].dstirq, + irq_trigger(i), + irq_polarity(i)); + best_guess = irq; + } + } + } + return best_guess; +} +EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); + +#ifndef CONFIG_XEN +void lock_vector_lock(void) +{ + /* Used to the online set of cpus does not change + * during assign_irq_vector. + */ + raw_spin_lock(&vector_lock); +} + +void unlock_vector_lock(void) +{ + raw_spin_unlock(&vector_lock); +} + +static int +__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) +{ + /* + * NOTE! The local APIC isn't very good at handling + * multiple interrupts at the same interrupt level. + * As the interrupt level is determined by taking the + * vector number and shifting that right by 4, we + * want to spread these out a bit so that they don't + * all fall in the same interrupt level. + * + * Also, we've got to be careful not to trash gate + * 0x80, because int 0x80 is hm, kind of importantish. ;) + */ + static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START; + static int current_offset = VECTOR_OFFSET_START % 8; + unsigned int old_vector; + int cpu, err; + cpumask_var_t tmp_mask; + + if (cfg->move_in_progress) + return -EBUSY; + + if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) + return -ENOMEM; + + old_vector = cfg->vector; + if (old_vector) { + cpumask_and(tmp_mask, mask, cpu_online_mask); + cpumask_and(tmp_mask, cfg->domain, tmp_mask); + if (!cpumask_empty(tmp_mask)) { + free_cpumask_var(tmp_mask); + return 0; + } + } + + /* Only try and allocate irqs on cpus that are present */ + err = -ENOSPC; + for_each_cpu_and(cpu, mask, cpu_online_mask) { + int new_cpu; + int vector, offset; + + apic->vector_allocation_domain(cpu, tmp_mask); + + vector = current_vector; + offset = current_offset; +next: + vector += 8; + if (vector >= first_system_vector) { + /* If out of vectors on large boxen, must share them. 
*/ + offset = (offset + 1) % 8; + vector = FIRST_EXTERNAL_VECTOR + offset; + } + if (unlikely(current_vector == vector)) + continue; + + if (test_bit(vector, used_vectors)) + goto next; + +#ifdef CONFIG_KDB + if (vector == KDBENTER_VECTOR) + goto next; +#endif /* CONFIG_KDB */ + for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) + if (per_cpu(vector_irq, new_cpu)[vector] != -1) + goto next; + /* Found one! */ + current_vector = vector; + current_offset = offset; + if (old_vector) { + cfg->move_in_progress = 1; + cpumask_copy(cfg->old_domain, cfg->domain); + } + for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) + per_cpu(vector_irq, new_cpu)[vector] = irq; + cfg->vector = vector; + cpumask_copy(cfg->domain, tmp_mask); + err = 0; + break; + } + free_cpumask_var(tmp_mask); + return err; +} + +int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) +{ + int err; + unsigned long flags; + + raw_spin_lock_irqsave(&vector_lock, flags); + err = __assign_irq_vector(irq, cfg, mask); + raw_spin_unlock_irqrestore(&vector_lock, flags); + return err; +} + +static void __clear_irq_vector(int irq, struct irq_cfg *cfg) +{ + int cpu, vector; + + BUG_ON(!cfg->vector); + + vector = cfg->vector; + for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) + per_cpu(vector_irq, cpu)[vector] = -1; + + cfg->vector = 0; + cpumask_clear(cfg->domain); + + if (likely(!cfg->move_in_progress)) + return; + for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { + for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; + vector++) { + if (per_cpu(vector_irq, cpu)[vector] != irq) + continue; + per_cpu(vector_irq, cpu)[vector] = -1; + break; + } + } + cfg->move_in_progress = 0; +} + +void __setup_vector_irq(int cpu) +{ + /* Initialize vector_irq on a new cpu */ + int irq, vector; + struct irq_cfg *cfg; + + /* + * vector_lock will make sure that we don't run into irq vector + * assignments that might be happening on another cpu in parallel, + * while we setup our initial vector to irq mappings. + */ + raw_spin_lock(&vector_lock); + /* Mark the inuse vectors */ + for_each_active_irq(irq) { + cfg = irq_get_chip_data(irq); + if (!cfg) + continue; + /* + * If it is a legacy IRQ handled by the legacy PIC, this cpu + * will be part of the irq_cfg's domain. 
+ */ + if (irq < legacy_pic->nr_legacy_irqs && !IO_APIC_IRQ(irq)) + cpumask_set_cpu(cpu, cfg->domain); + + if (!cpumask_test_cpu(cpu, cfg->domain)) + continue; + vector = cfg->vector; + per_cpu(vector_irq, cpu)[vector] = irq; + } + /* Mark the free vectors */ + for (vector = 0; vector < NR_VECTORS; ++vector) { + irq = per_cpu(vector_irq, cpu)[vector]; + if (irq < 0) + continue; + + cfg = irq_cfg(irq); + if (!cpumask_test_cpu(cpu, cfg->domain)) + per_cpu(vector_irq, cpu)[vector] = -1; + } + raw_spin_unlock(&vector_lock); +} + +static struct irq_chip ioapic_chip; + +#ifdef CONFIG_X86_32 +static inline int IO_APIC_irq_trigger(int irq) +{ + int apic, idx, pin; + + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) { + idx = find_irq_entry(apic, pin, mp_INT); + if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) + return irq_trigger(idx); + } + } + /* + * nonexistent IRQs are edge default + */ + return 0; +} +#else +static inline int IO_APIC_irq_trigger(int irq) +{ + return 1; +} +#endif + +static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg, + unsigned long trigger) +{ + struct irq_chip *chip = &ioapic_chip; + irq_flow_handler_t hdl; + bool fasteoi; + + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) { + irq_set_status_flags(irq, IRQ_LEVEL); + fasteoi = true; + } else { + irq_clear_status_flags(irq, IRQ_LEVEL); + fasteoi = false; + } + + if (irq_remapped(cfg)) { + irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); + irq_remap_modify_chip_defaults(chip); + fasteoi = trigger != 0; + } + + hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq; + irq_set_chip_and_handler_name(irq, chip, hdl, + fasteoi ? "fasteoi" : "edge"); +} + + +static int setup_ir_ioapic_entry(int irq, + struct IR_IO_APIC_route_entry *entry, + unsigned int destination, int vector, + struct io_apic_irq_attr *attr) +{ + int index; + struct irte irte; + int ioapic_id = mpc_ioapic_id(attr->ioapic); + struct intel_iommu *iommu = map_ioapic_to_ir(ioapic_id); + + if (!iommu) { + pr_warn("No mapping iommu for ioapic %d\n", ioapic_id); + return -ENODEV; + } + + index = alloc_irte(iommu, irq, 1); + if (index < 0) { + pr_warn("Failed to allocate IRTE for ioapic %d\n", ioapic_id); + return -ENOMEM; + } + + prepare_irte(&irte, vector, destination); + + /* Set source-id of interrupt request */ + set_ioapic_sid(&irte, ioapic_id); + + modify_irte(irq, &irte); + + apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: " + "Set IRTE entry (P:%d FPD:%d Dst_Mode:%d " + "Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X " + "Avail:%X Vector:%02X Dest:%08X " + "SID:%04X SQ:%X SVT:%X)\n", + attr->ioapic, irte.present, irte.fpd, irte.dst_mode, + irte.redir_hint, irte.trigger_mode, irte.dlvry_mode, + irte.avail, irte.vector, irte.dest_id, + irte.sid, irte.sq, irte.svt); + + memset(entry, 0, sizeof(*entry)); + + entry->index2 = (index >> 15) & 0x1; + entry->zero = 0; + entry->format = 1; + entry->index = (index & 0x7fff); + /* + * IO-APIC RTE will be configured with virtual vector. + * irq handler will do the explicit EOI to the io-apic. + */ + entry->vector = attr->ioapic_pin; + entry->mask = 0; /* enable IRQ */ + entry->trigger = attr->trigger; + entry->polarity = attr->polarity; + + /* Mask level triggered irqs. + * Use IRQ_DELAYED_DISABLE for edge triggered irqs. 
+ */ + if (attr->trigger) + entry->mask = 1; + + return 0; +} +#else /* !CONFIG_XEN */ +#define __clear_irq_vector(irq, cfg) ((void)0) +#define ioapic_register_intr(irq, cfg, trigger) evtchn_register_pirq(irq) +#endif + +static int setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry, + unsigned int destination, int vector, + struct io_apic_irq_attr *attr) +{ +#ifndef CONFIG_XEN + if (intr_remapping_enabled) + return setup_ir_ioapic_entry(irq, + (struct IR_IO_APIC_route_entry *)entry, + destination, vector, attr); +#endif + + memset(entry, 0, sizeof(*entry)); + + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->dest = destination; + entry->vector = vector; + entry->mask = 0; /* enable IRQ */ + entry->trigger = attr->trigger; + entry->polarity = attr->polarity; + + /* + * Mask level triggered irqs. + * Use IRQ_DELAYED_DISABLE for edge triggered irqs. + */ + if (attr->trigger) + entry->mask = 1; + + return 0; +} + +static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, + struct io_apic_irq_attr *attr) +{ + struct IO_APIC_route_entry entry; + unsigned int dest; + + if (!IO_APIC_IRQ(irq)) + return; +#ifndef CONFIG_XEN + /* + * For legacy irqs, cfg->domain starts with cpu 0 for legacy + * controllers like 8259. Now that IO-APIC can handle this irq, update + * the cfg->domain. + */ + if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain)) + apic->vector_allocation_domain(0, cfg->domain); +#else + /* + * For legacy IRQs we may get here before trigger mode and polarity + * get obtained, but Xen refuses to set those through + * PHYSDEVOP_setup_gsi more than once (perhaps even at all). + */ + if (irq >= legacy_pic->nr_legacy_irqs + || test_bit(attr->ioapic_pin, + ioapics[attr->ioapic].pin_programmed)) { + struct physdev_setup_gsi setup_gsi = { + .gsi = irq, + .triggering = attr->trigger, + .polarity = attr->polarity + }; + struct physdev_map_pirq map_pirq = { + .domid = DOMID_SELF, + .type = MAP_PIRQ_TYPE_GSI, + .index = irq, + .pirq = irq + }; + + switch (HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, + &setup_gsi)) { + case -EEXIST: + if (irq < legacy_pic->nr_legacy_irqs) + break; + /* fall through */ + case 0: + evtchn_register_pirq(irq); + if (HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, + &map_pirq) == 0) { + /* fake (for init_IO_APIC_traps()): */ + cfg->vector = irq; + return; + } + } + } +#endif + + if (assign_irq_vector(irq, cfg, apic->target_cpus())) + return; + +#ifndef CONFIG_XEN + dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); +#else + dest = 0; /* meaningless */ +#endif + + apic_printk(APIC_VERBOSE,KERN_DEBUG + "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " + "IRQ %d Mode:%i Active:%i Dest:%d)\n", + attr->ioapic, mpc_ioapic_id(attr->ioapic), attr->ioapic_pin, + cfg->vector, irq, attr->trigger, attr->polarity, dest); + + if (setup_ioapic_entry(irq, &entry, dest, cfg->vector, attr)) { + pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n", + mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); + __clear_irq_vector(irq, cfg); + + return; + } + + ioapic_register_intr(irq, cfg, attr->trigger); +#ifndef CONFIG_XEN + if (irq < legacy_pic->nr_legacy_irqs) + legacy_pic->mask(irq); +#endif + + ioapic_write_entry(attr->ioapic, attr->ioapic_pin, entry); +} + +static bool __init io_apic_pin_not_connected(int idx, int ioapic_idx, int pin) +{ + if (idx != -1) + return false; + + apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n", + mpc_ioapic_id(ioapic_idx), pin); + 
return true; +} + +static void __init __io_apic_setup_irqs(unsigned int ioapic_idx) +{ + int idx, node = cpu_to_node(0); + struct io_apic_irq_attr attr; + unsigned int pin, irq; + + for (pin = 0; pin < ioapics[ioapic_idx].nr_registers; pin++) { + idx = find_irq_entry(ioapic_idx, pin, mp_INT); + if (io_apic_pin_not_connected(idx, ioapic_idx, pin)) + continue; + + irq = pin_2_irq(idx, ioapic_idx, pin); + + if ((ioapic_idx > 0) && (irq > 16)) + continue; + +#ifdef CONFIG_XEN + if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs) + continue; +#else + /* + * Skip the timer IRQ if there's a quirk handler + * installed and if it returns 1: + */ + if (apic->multi_timer_check && + apic->multi_timer_check(ioapic_idx, irq)) + continue; +#endif + + set_io_apic_irq_attr(&attr, ioapic_idx, pin, irq_trigger(idx), + irq_polarity(idx)); + + io_apic_setup_irq_pin(irq, node, &attr); + } +} + +static void __init setup_IO_APIC_irqs(void) +{ + unsigned int ioapic_idx; + + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); + + for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) + __io_apic_setup_irqs(ioapic_idx); +} + +/* + * for the gsit that is not in first ioapic + * but could not use acpi_register_gsi() + * like some special sci in IBM x3330 + */ +void setup_IO_APIC_irq_extra(u32 gsi) +{ + int ioapic_idx = 0, pin, idx, irq, node = cpu_to_node(0); + struct io_apic_irq_attr attr; + + /* + * Convert 'gsi' to 'ioapic.pin'. + */ + ioapic_idx = mp_find_ioapic(gsi); + if (ioapic_idx < 0) + return; + + pin = mp_find_ioapic_pin(ioapic_idx, gsi); + idx = find_irq_entry(ioapic_idx, pin, mp_INT); + if (idx == -1) + return; + + irq = pin_2_irq(idx, ioapic_idx, pin); +#ifdef CONFIG_XEN + if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs) + return; +#endif + + /* Only handle the non legacy irqs on secondary ioapics */ + if (ioapic_idx == 0 || irq < NR_IRQS_LEGACY) + return; + + set_io_apic_irq_attr(&attr, ioapic_idx, pin, irq_trigger(idx), + irq_polarity(idx)); + + io_apic_setup_irq_pin_once(irq, node, &attr); +} + +#ifndef CONFIG_XEN +/* + * Set up the timer pin, possibly with the 8259A-master behind. + */ +static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx, + unsigned int pin, int vector) +{ + struct IO_APIC_route_entry entry; + + if (intr_remapping_enabled) + return; + + memset(&entry, 0, sizeof(entry)); + + /* + * We use logical delivery to get the timer IRQ + * to the first CPU. + */ + entry.dest_mode = apic->irq_dest_mode; + entry.mask = 0; /* don't mask IRQ for edge */ + entry.dest = apic->cpu_mask_to_apicid(apic->target_cpus()); + entry.delivery_mode = apic->irq_delivery_mode; + entry.polarity = 0; + entry.trigger = 0; + entry.vector = vector; + + /* + * The timer IRQ doesn't have to know that behind the + * scene we may have a 8259A-master in AEOI mode ... 
+ */ + irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, + "edge"); + + /* + * Add it to the IO-APIC irq-routing table: + */ + ioapic_write_entry(ioapic_idx, pin, entry); +} + +__apicdebuginit(void) print_IO_APIC(int ioapic_idx) +{ + int i; + union IO_APIC_reg_00 reg_00; + union IO_APIC_reg_01 reg_01; + union IO_APIC_reg_02 reg_02; + union IO_APIC_reg_03 reg_03; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic_idx, 0); + reg_01.raw = io_apic_read(ioapic_idx, 1); + if (reg_01.bits.version >= 0x10) + reg_02.raw = io_apic_read(ioapic_idx, 2); + if (reg_01.bits.version >= 0x20) + reg_03.raw = io_apic_read(ioapic_idx, 3); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + printk("\n"); + printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx)); + printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); + printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); + printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); + printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); + + printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); + printk(KERN_DEBUG "....... : max redirection entries: %02X\n", + reg_01.bits.entries); + + printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); + printk(KERN_DEBUG "....... : IO APIC version: %02X\n", + reg_01.bits.version); + + /* + * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, + * but the value of reg_02 is read as the previous read register + * value, so ignore it if reg_02 == reg_01. + */ + if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { + printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); + printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); + } + + /* + * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 + * or reg_03, but the value of reg_0[23] is read as the previous read + * register value, so ignore it if reg_03 == reg_0[12]. + */ + if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && + reg_03.raw != reg_01.raw) { + printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); + printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); + } + + printk(KERN_DEBUG ".... 
IRQ redirection table:\n"); + + if (intr_remapping_enabled) { + printk(KERN_DEBUG " NR Indx Fmt Mask Trig IRR" + " Pol Stat Indx2 Zero Vect:\n"); + } else { + printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" + " Stat Dmod Deli Vect:\n"); + } + + for (i = 0; i <= reg_01.bits.entries; i++) { + if (intr_remapping_enabled) { + struct IO_APIC_route_entry entry; + struct IR_IO_APIC_route_entry *ir_entry; + + entry = ioapic_read_entry(ioapic_idx, i); + ir_entry = (struct IR_IO_APIC_route_entry *) &entry; + printk(KERN_DEBUG " %02x %04X ", + i, + ir_entry->index + ); + printk("%1d %1d %1d %1d %1d " + "%1d %1d %X %02X\n", + ir_entry->format, + ir_entry->mask, + ir_entry->trigger, + ir_entry->irr, + ir_entry->polarity, + ir_entry->delivery_status, + ir_entry->index2, + ir_entry->zero, + ir_entry->vector + ); + } else { + struct IO_APIC_route_entry entry; + + entry = ioapic_read_entry(ioapic_idx, i); + printk(KERN_DEBUG " %02x %02X ", + i, + entry.dest + ); + printk("%1d %1d %1d %1d %1d " + "%1d %1d %02X\n", + entry.mask, + entry.trigger, + entry.irr, + entry.polarity, + entry.delivery_status, + entry.dest_mode, + entry.delivery_mode, + entry.vector + ); + } + } +} + +__apicdebuginit(void) print_IO_APICs(void) +{ + int ioapic_idx; + struct irq_cfg *cfg; + unsigned int irq; + struct irq_chip *chip; + + printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); + for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) + printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", + mpc_ioapic_id(ioapic_idx), + ioapics[ioapic_idx].nr_registers); + + /* + * We are a bit conservative about what we expect. We have to + * know about every hardware change ASAP. + */ + printk(KERN_INFO "testing the IO APIC.......................\n"); + + for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) + print_IO_APIC(ioapic_idx); + + printk(KERN_DEBUG "IRQ to pin mappings:\n"); + for_each_active_irq(irq) { + struct irq_pin_list *entry; + + chip = irq_get_chip(irq); + if (chip != &ioapic_chip) + continue; + + cfg = irq_get_chip_data(irq); + if (!cfg) + continue; + entry = cfg->irq_2_pin; + if (!entry) + continue; + printk(KERN_DEBUG "IRQ%d ", irq); + for_each_irq_pin(entry, cfg->irq_2_pin) + printk("-> %d:%d", entry->apic, entry->pin); + printk("\n"); + } + + printk(KERN_INFO ".................................... done.\n"); +} + +__apicdebuginit(void) print_APIC_field(int base) +{ + int i; + + printk(KERN_DEBUG); + + for (i = 0; i < 8; i++) + printk(KERN_CONT "%08x", apic_read(base + i*0x10)); + + printk(KERN_CONT "\n"); +} + +__apicdebuginit(void) print_local_APIC(void *dummy) +{ + unsigned int i, v, ver, maxlvt; + u64 icr; + + printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", + smp_processor_id(), hard_smp_processor_id()); + v = apic_read(APIC_ID); + printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id()); + v = apic_read(APIC_LVR); + printk(KERN_INFO "... APIC VERSION: %08x\n", v); + ver = GET_APIC_VERSION(v); + maxlvt = lapic_get_maxlvt(); + + v = apic_read(APIC_TASKPRI); + printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); + + if (APIC_INTEGRATED(ver)) { /* !82489DX */ + if (!APIC_XAPIC(ver)) { + v = apic_read(APIC_ARBPRI); + printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, + v & APIC_ARBPRI_MASK); + } + v = apic_read(APIC_PROCPRI); + printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); + } + + /* + * Remote read supported only in the 82489DX and local APIC for + * Pentium processors. 
+ */ + if (!APIC_INTEGRATED(ver) || maxlvt == 3) { + v = apic_read(APIC_RRR); + printk(KERN_DEBUG "... APIC RRR: %08x\n", v); + } + + v = apic_read(APIC_LDR); + printk(KERN_DEBUG "... APIC LDR: %08x\n", v); + if (!x2apic_enabled()) { + v = apic_read(APIC_DFR); + printk(KERN_DEBUG "... APIC DFR: %08x\n", v); + } + v = apic_read(APIC_SPIV); + printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); + + printk(KERN_DEBUG "... APIC ISR field:\n"); + print_APIC_field(APIC_ISR); + printk(KERN_DEBUG "... APIC TMR field:\n"); + print_APIC_field(APIC_TMR); + printk(KERN_DEBUG "... APIC IRR field:\n"); + print_APIC_field(APIC_IRR); + + if (APIC_INTEGRATED(ver)) { /* !82489DX */ + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + + v = apic_read(APIC_ESR); + printk(KERN_DEBUG "... APIC ESR: %08x\n", v); + } + + icr = apic_icr_read(); + printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr); + printk(KERN_DEBUG "... APIC ICR2: %08x\n", (u32)(icr >> 32)); + + v = apic_read(APIC_LVTT); + printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); + + if (maxlvt > 3) { /* PC is LVT#4. */ + v = apic_read(APIC_LVTPC); + printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); + } + v = apic_read(APIC_LVT0); + printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); + v = apic_read(APIC_LVT1); + printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); + + if (maxlvt > 2) { /* ERR is LVT#3. */ + v = apic_read(APIC_LVTERR); + printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); + } + + v = apic_read(APIC_TMICT); + printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); + v = apic_read(APIC_TMCCT); + printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); + v = apic_read(APIC_TDCR); + printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); + + if (boot_cpu_has(X86_FEATURE_EXTAPIC)) { + v = apic_read(APIC_EFEAT); + maxlvt = (v >> 16) & 0xff; + printk(KERN_DEBUG "... APIC EFEAT: %08x\n", v); + v = apic_read(APIC_ECTRL); + printk(KERN_DEBUG "... APIC ECTRL: %08x\n", v); + for (i = 0; i < maxlvt; i++) { + v = apic_read(APIC_EILVTn(i)); + printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v); + } + } + printk("\n"); +} + +__apicdebuginit(void) print_local_APICs(int maxcpu) +{ + int cpu; + + if (!maxcpu) + return; + + preempt_disable(); + for_each_online_cpu(cpu) { + if (cpu >= maxcpu) + break; + smp_call_function_single(cpu, print_local_APIC, NULL, 1); + } + preempt_enable(); +} + +__apicdebuginit(void) print_PIC(void) +{ + unsigned int v; + unsigned long flags; + + if (!legacy_pic->nr_legacy_irqs) + return; + + printk(KERN_DEBUG "\nprinting PIC contents\n"); + + raw_spin_lock_irqsave(&i8259A_lock, flags); + + v = inb(0xa1) << 8 | inb(0x21); + printk(KERN_DEBUG "... PIC IMR: %04x\n", v); + + v = inb(0xa0) << 8 | inb(0x20); + printk(KERN_DEBUG "... PIC IRR: %04x\n", v); + + outb(0x0b,0xa0); + outb(0x0b,0x20); + v = inb(0xa0) << 8 | inb(0x20); + outb(0x0a,0xa0); + outb(0x0a,0x20); + + raw_spin_unlock_irqrestore(&i8259A_lock, flags); + + printk(KERN_DEBUG "... PIC ISR: %04x\n", v); + + v = inb(0x4d1) << 8 | inb(0x4d0); + printk(KERN_DEBUG "... 
PIC ELCR: %04x\n", v); +} + +static int __initdata show_lapic = 1; +static __init int setup_show_lapic(char *arg) +{ + int num = -1; + + if (strcmp(arg, "all") == 0) { + show_lapic = CONFIG_NR_CPUS; + } else { + get_option(&arg, &num); + if (num >= 0) + show_lapic = num; + } + + return 1; +} +__setup("show_lapic=", setup_show_lapic); + +__apicdebuginit(int) print_ICs(void) +{ + if (apic_verbosity == APIC_QUIET) + return 0; + + print_PIC(); + + /* don't print out if apic is not there */ + if (!cpu_has_apic && !apic_from_smp_config()) + return 0; + + print_local_APICs(show_lapic); + print_IO_APICs(); + + return 0; +} + +late_initcall(print_ICs); + + +/* Where if anywhere is the i8259 connect in external int mode */ +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; + +void __init enable_IO_APIC(void) +{ + int i8259_apic, i8259_pin; + int apic; + + if (!legacy_pic->nr_legacy_irqs) + return; + + for(apic = 0; apic < nr_ioapics; apic++) { + int pin; + /* See if any of the pins is in ExtINT mode */ + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) { + struct IO_APIC_route_entry entry; + entry = ioapic_read_entry(apic, pin); + + /* If the interrupt line is enabled and in ExtInt mode + * I have found the pin where the i8259 is connected. + */ + if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { + ioapic_i8259.apic = apic; + ioapic_i8259.pin = pin; + goto found_i8259; + } + } + } + found_i8259: + /* Look to see what if the MP table has reported the ExtINT */ + /* If we could not find the appropriate pin by looking at the ioapic + * the i8259 probably is not connected the ioapic but give the + * mptable a chance anyway. + */ + i8259_pin = find_isa_irq_pin(0, mp_ExtINT); + i8259_apic = find_isa_irq_apic(0, mp_ExtINT); + /* Trust the MP table if nothing is setup in the hardware */ + if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { + printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); + ioapic_i8259.pin = i8259_pin; + ioapic_i8259.apic = i8259_apic; + } + /* Complain if the MP table and the hardware disagree */ + if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && + (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) + { + printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); + } + + /* + * Do not trust the IO-APIC being empty at bootup + */ + clear_IO_APIC(); +} + +/* + * Not an __init, needed by the reboot code + */ +void disable_IO_APIC(void) +{ + /* + * Clear the IO-APIC before rebooting: + */ + clear_IO_APIC(); + + if (!legacy_pic->nr_legacy_irqs) + return; + + /* + * If the i8259 is routed through an IOAPIC + * Put that IOAPIC in virtual wire mode + * so legacy interrupts can be delivered. + * + * With interrupt-remapping, for now we will use virtual wire A mode, + * as virtual wire B is little complex (need to configure both + * IOAPIC RTE as well as interrupt-remapping table entry). + * As this gets called during crash dump, keep this simple for now. 
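+ *
+ * The ExtINT entry written below amounts to (sketch of the assignments
+ * that follow, not an additional configuration step): unmasked, edge
+ * triggered, active high, physical destination mode, delivery_mode =
+ * dest_ExtINT, vector = 0, dest = read_apic_id(), i.e. the boot CPU.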
+ */ + if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) { + struct IO_APIC_route_entry entry; + + memset(&entry, 0, sizeof(entry)); + entry.mask = 0; /* Enabled */ + entry.trigger = 0; /* Edge */ + entry.irr = 0; + entry.polarity = 0; /* High */ + entry.delivery_status = 0; + entry.dest_mode = 0; /* Physical */ + entry.delivery_mode = dest_ExtINT; /* ExtInt */ + entry.vector = 0; + entry.dest = read_apic_id(); + + /* + * Add it to the IO-APIC irq-routing table: + */ + ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); + } + + /* + * Use virtual wire A mode when interrupt remapping is enabled. + */ + if (cpu_has_apic || apic_from_smp_config()) + disconnect_bsp_APIC(!intr_remapping_enabled && + ioapic_i8259.pin != -1); +} + +#ifdef CONFIG_X86_32 +/* + * function to set the IO-APIC physical IDs based on the + * values stored in the MPC table. + * + * by Matt Domsch Tue Dec 21 12:25:05 CST 1999 + */ +void __init setup_ioapic_ids_from_mpc_nocheck(void) +{ + union IO_APIC_reg_00 reg_00; + physid_mask_t phys_id_present_map; + int ioapic_idx; + int i; + unsigned char old_id; + unsigned long flags; + + /* + * This is broken; anything with a real cpu count has to + * circumvent this idiocy regardless. + */ + apic->ioapic_phys_id_map(&phys_cpu_present_map, &phys_id_present_map); + + /* + * Set the IOAPIC ID to the value stored in the MPC table. + */ + for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) { + /* Read the register 0 value */ + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic_idx, 0); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + old_id = mpc_ioapic_id(ioapic_idx); + + if (mpc_ioapic_id(ioapic_idx) >= get_physical_broadcast()) { + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", + ioapic_idx, mpc_ioapic_id(ioapic_idx)); + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", + reg_00.bits.ID); + ioapics[ioapic_idx].mp_config.apicid = reg_00.bits.ID; + } + + /* + * Sanity check, is the ID really free? Every APIC in a + * system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. + */ + if (apic->check_apicid_used(&phys_id_present_map, + mpc_ioapic_id(ioapic_idx))) { + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", + ioapic_idx, mpc_ioapic_id(ioapic_idx)); + for (i = 0; i < get_physical_broadcast(); i++) + if (!physid_isset(i, phys_id_present_map)) + break; + if (i >= get_physical_broadcast()) + panic("Max APIC ID exceeded!\n"); + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", + i); + physid_set(i, phys_id_present_map); + ioapics[ioapic_idx].mp_config.apicid = i; + } else { + physid_mask_t tmp; + apic->apicid_to_cpu_present(mpc_ioapic_id(ioapic_idx), + &tmp); + apic_printk(APIC_VERBOSE, "Setting %d in the " + "phys_id_present_map\n", + mpc_ioapic_id(ioapic_idx)); + physids_or(phys_id_present_map, phys_id_present_map, tmp); + } + + /* + * We need to adjust the IRQ routing table + * if the ID changed. + */ + if (old_id != mpc_ioapic_id(ioapic_idx)) + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].dstapic == old_id) + mp_irqs[i].dstapic + = mpc_ioapic_id(ioapic_idx); + + /* + * Update the ID register according to the right value + * from the MPC table if they are different. 
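+ *
+ * In sketch form this is a plain write-then-verify of register 0:
+ *
+ *	reg_00.bits.ID = mpc_ioapic_id(ioapic_idx);
+ *	io_apic_write(ioapic_idx, 0, reg_00.raw);
+ *	... re-read register 0 and warn if the new ID did not stick ...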
+ */ + if (mpc_ioapic_id(ioapic_idx) == reg_00.bits.ID) + continue; + + apic_printk(APIC_VERBOSE, KERN_INFO + "...changing IO-APIC physical APIC ID to %d ...", + mpc_ioapic_id(ioapic_idx)); + + reg_00.bits.ID = mpc_ioapic_id(ioapic_idx); + raw_spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(ioapic_idx, 0, reg_00.raw); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + /* + * Sanity check + */ + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic_idx, 0); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) + printk("could not set ID!\n"); + else + apic_printk(APIC_VERBOSE, " ok.\n"); + } +} + +void __init setup_ioapic_ids_from_mpc(void) +{ + + if (acpi_ioapic) + return; + /* + * Don't check I/O APIC IDs for xAPIC systems. They have + * no meaning without the serial APIC bus. + */ + if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + return; + setup_ioapic_ids_from_mpc_nocheck(); +} +#endif + +int no_timer_check __initdata; + +static int __init notimercheck(char *s) +{ + no_timer_check = 1; + return 1; +} +__setup("no_timer_check", notimercheck); + +/* + * There is a nasty bug in some older SMP boards, their mptable lies + * about the timer IRQ. We do the following to work around the situation: + * + * - timer IRQ defaults to IO-APIC IRQ + * - if this function detects that timer IRQs are defunct, then we fall + * back to ISA timer IRQs + */ +static int __init timer_irq_works(void) +{ + unsigned long t1 = jiffies; + unsigned long flags; + + if (no_timer_check) + return 1; + + local_save_flags(flags); + local_irq_enable(); + /* Let ten ticks pass... */ + mdelay((10 * 1000) / HZ); + local_irq_restore(flags); + + /* + * Expect a few ticks at least, to be sure some possible + * glue logic does not lock up after one or two first + * ticks in a non-ExtINT mode. Also the local APIC + * might have cached one ExtINT interrupt. Finally, at + * least one tick may be lost due to delays. + */ + + /* jiffies wrap? */ + if (time_after(jiffies, t1 + 4)) + return 1; + return 0; +} + +/* + * In the SMP+IOAPIC case it might happen that there are an unspecified + * number of pending IRQ events unhandled. These cases are very rare, + * so we 'resend' these IRQs via IPIs, to the same CPU. It's much + * better to do it this way as thus we do not have to be aware of + * 'pending' interrupts in the IRQ path, except at this point. + */ +/* + * Edge triggered needs to resend any interrupt + * that was delayed but this is now handled in the device + * independent code. + */ + +/* + * Starting up a edge-triggered IO-APIC interrupt is + * nasty - we need to make sure that we get the edge. + * If it is already asserted for some reason, we need + * return 1 to indicate that is was pending. + * + * This is not complete - we should be able to fake + * an edge even if it isn't on the 8259A... 
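+ *
+ * The sequence below is, in outline: mask the 8259A line (legacy IRQs
+ * only), note whether it was already pending there, unmask the
+ * IO-APIC entry, and return that pending state.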
+ */ + +static unsigned int startup_ioapic_irq(struct irq_data *data) +{ + int was_pending = 0, irq = data->irq; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + if (irq < legacy_pic->nr_legacy_irqs) { + legacy_pic->mask(irq); + if (legacy_pic->irq_pending(irq)) + was_pending = 1; + } + __unmask_ioapic(data->chip_data); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + return was_pending; +} + +static int ioapic_retrigger_irq(struct irq_data *data) +{ + struct irq_cfg *cfg = data->chip_data; + unsigned long flags; + + raw_spin_lock_irqsave(&vector_lock, flags); + apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); + raw_spin_unlock_irqrestore(&vector_lock, flags); + + return 1; +} + +/* + * Level and edge triggered IO-APIC interrupts need different handling, + * so we use two separate IRQ descriptors. Edge triggered IRQs can be + * handled with the level-triggered descriptor, but that one has slightly + * more overhead. Level-triggered interrupts cannot be handled with the + * edge-triggered handler, without risking IRQ storms and other ugly + * races. + */ + +#ifdef CONFIG_SMP +void send_cleanup_vector(struct irq_cfg *cfg) +{ + cpumask_var_t cleanup_mask; + + if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { + unsigned int i; + for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); + } else { + cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); + apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + free_cpumask_var(cleanup_mask); + } + cfg->move_in_progress = 0; +} + +static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) +{ + int apic, pin; + struct irq_pin_list *entry; + u8 vector = cfg->vector; + + for_each_irq_pin(entry, cfg->irq_2_pin) { + unsigned int reg; + + apic = entry->apic; + pin = entry->pin; + /* + * With interrupt-remapping, destination information comes + * from interrupt-remapping table entry. + */ + if (!irq_remapped(cfg)) + io_apic_write(apic, 0x11 + pin*2, dest); + reg = io_apic_read(apic, 0x10 + pin*2); + reg &= ~IO_APIC_REDIR_VECTOR_MASK; + reg |= vector; + io_apic_modify(apic, 0x10 + pin*2, reg); + } +} + +/* + * Either sets data->affinity to a valid value, and returns + * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and + * leaves data->affinity untouched. + */ +int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, + unsigned int *dest_id) +{ + struct irq_cfg *cfg = data->chip_data; + + if (!cpumask_intersects(mask, cpu_online_mask)) + return -1; + + if (assign_irq_vector(data->irq, data->chip_data, mask)) + return -1; + + cpumask_copy(data->affinity, mask); + + *dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain); + return 0; +} + +static int +ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) +{ + unsigned int dest, irq = data->irq; + unsigned long flags; + int ret; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + ret = __ioapic_set_affinity(data, mask, &dest); + if (!ret) { + /* Only the high 8 bits are valid. */ + dest = SET_APIC_LOGICAL_ID(dest); + __target_IO_APIC_irq(irq, dest, data->chip_data); + } + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + return ret; +} + +#ifdef CONFIG_IRQ_REMAP + +/* + * Migrate the IO-APIC irq in the presence of intr-remapping. 
+ * + * For both level and edge triggered, irq migration is a simple atomic + * update(of vector and cpu destination) of IRTE and flush the hardware cache. + * + * For level triggered, we eliminate the io-apic RTE modification (with the + * updated vector information), by using a virtual vector (io-apic pin number). + * Real vector that is used for interrupting cpu will be coming from + * the interrupt-remapping table entry. + * + * As the migration is a simple atomic update of IRTE, the same mechanism + * is used to migrate MSI irq's in the presence of interrupt-remapping. + */ +static int +ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) +{ + struct irq_cfg *cfg = data->chip_data; + unsigned int dest, irq = data->irq; + struct irte irte; + + if (!cpumask_intersects(mask, cpu_online_mask)) + return -EINVAL; + + if (get_irte(irq, &irte)) + return -EBUSY; + + if (assign_irq_vector(irq, cfg, mask)) + return -EBUSY; + + dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); + + irte.vector = cfg->vector; + irte.dest_id = IRTE_DEST(dest); + + /* + * Atomically updates the IRTE with the new destination, vector + * and flushes the interrupt entry cache. + */ + modify_irte(irq, &irte); + + /* + * After this point, all the interrupts will start arriving + * at the new destination. So, time to cleanup the previous + * vector allocation. + */ + if (cfg->move_in_progress) + send_cleanup_vector(cfg); + + cpumask_copy(data->affinity, mask); + return 0; +} + +#else +static inline int +ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) +{ + return 0; +} +#endif + +asmlinkage void smp_irq_move_cleanup_interrupt(void) +{ + unsigned vector, me; + + ack_APIC_irq(); + irq_enter(); + exit_idle(); + + me = smp_processor_id(); + for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { + unsigned int irq; + unsigned int irr; + struct irq_desc *desc; + struct irq_cfg *cfg; + irq = __this_cpu_read(vector_irq[vector]); + + if (irq == -1) + continue; + + desc = irq_to_desc(irq); + if (!desc) + continue; + + cfg = irq_cfg(irq); + raw_spin_lock(&desc->lock); + + /* + * Check if the irq migration is in progress. If so, we + * haven't received the cleanup request yet for this irq. + */ + if (cfg->move_in_progress) + goto unlock; + + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) + goto unlock; + + irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); + /* + * Check if the vector that needs to be cleanedup is + * registered at the cpu's IRR. If so, then this is not + * the best time to clean it up. Lets clean it up in the + * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR + * to myself. 
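+ *
+ * (The IRR is banked as 32-bit words spaced 0x10 apart, so the pending
+ *  bit for this vector is, in sketch form,
+ *	apic_read(APIC_IRR + (vector / 32) * 0x10) & (1 << (vector % 32))
+ *  which is what the read above and the test below combine to.)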
+ */ + if (irr & (1 << (vector % 32))) { + apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); + goto unlock; + } + __this_cpu_write(vector_irq[vector], -1); +unlock: + raw_spin_unlock(&desc->lock); + } + + irq_exit(); +} + +static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector) +{ + unsigned me; + + if (likely(!cfg->move_in_progress)) + return; + + me = smp_processor_id(); + + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) + send_cleanup_vector(cfg); +} + +static void irq_complete_move(struct irq_cfg *cfg) +{ + __irq_complete_move(cfg, ~get_irq_regs()->orig_ax); +} + +void irq_force_complete_move(int irq) +{ + struct irq_cfg *cfg = irq_get_chip_data(irq); + + if (!cfg) + return; + + __irq_complete_move(cfg, cfg->vector); +} +#else +static inline void irq_complete_move(struct irq_cfg *cfg) { } +#endif + +static void ack_apic_edge(struct irq_data *data) +{ + irq_complete_move(data->chip_data); + irq_move_irq(data); + ack_APIC_irq(); +} + +atomic_t irq_mis_count; + +static void ack_apic_level(struct irq_data *data) +{ + struct irq_cfg *cfg = data->chip_data; + int i, do_unmask_irq = 0, irq = data->irq; + unsigned long v; + + irq_complete_move(cfg); +#ifdef CONFIG_GENERIC_PENDING_IRQ + /* If we are moving the irq we need to mask it */ + if (unlikely(irqd_is_setaffinity_pending(data))) { + do_unmask_irq = 1; + mask_ioapic(cfg); + } +#endif + + /* + * It appears there is an erratum which affects at least version 0x11 + * of I/O APIC (that's the 82093AA and cores integrated into various + * chipsets). Under certain conditions a level-triggered interrupt is + * erroneously delivered as edge-triggered one but the respective IRR + * bit gets set nevertheless. As a result the I/O unit expects an EOI + * message but it will never arrive and further interrupts are blocked + * from the source. The exact reason is so far unknown, but the + * phenomenon was observed when two consecutive interrupt requests + * from a given source get delivered to the same CPU and the source is + * temporarily disabled in between. + * + * A workaround is to simulate an EOI message manually. We achieve it + * by setting the trigger mode to edge and then to level when the edge + * trigger mode gets detected in the TMR of a local APIC for a + * level-triggered interrupt. We mask the source for the time of the + * operation to prevent an edge-triggered interrupt escaping meanwhile. + * The idea is from Manfred Spraul. --macro + * + * Also in the case when cpu goes offline, fixup_irqs() will forward + * any unhandled interrupt on the offlined cpu to the new cpu + * destination that is handling the corresponding interrupt. This + * interrupt forwarding is done via IPI's. Hence, in this case also + * level-triggered io-apic interrupt will be seen as an edge + * interrupt in the IRR. And we can't rely on the cpu's EOI + * to be broadcasted to the IO-APIC's which will clear the remoteIRR + * corresponding to the level-triggered interrupt. Hence on IO-APIC's + * supporting EOI register, we do an explicit EOI to clear the + * remote IRR and on IO-APIC's which don't have an EOI register, + * we use the above logic (mask+edge followed by unmask+level) from + * Manfred Spraul to clear the remote IRR. + */ + i = cfg->vector; + v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); + + /* + * We must acknowledge the irq before we move it or the acknowledge will + * not propagate properly. 
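+ *
+ * (The TMR word for this vector was latched into 'v' above for the same
+ *  ordering reason; like the IRR it is banked in 32-bit words 0x10
+ *  apart, so (i & ~0x1f) >> 1 is just (i / 32) * 0x10, and bit
+ *  (i & 0x1f) records whether the interrupt was seen as level
+ *  triggered.)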
+ */ + ack_APIC_irq(); + + /* + * Tail end of clearing remote IRR bit (either by delivering the EOI + * message via io-apic EOI register write or simulating it using + * mask+edge followed by unnask+level logic) manually when the + * level triggered interrupt is seen as the edge triggered interrupt + * at the cpu. + */ + if (!(v & (1 << (i & 0x1f)))) { + atomic_inc(&irq_mis_count); + + eoi_ioapic_irq(irq, cfg); + } + + /* Now we can move and renable the irq */ + if (unlikely(do_unmask_irq)) { + /* Only migrate the irq if the ack has been received. + * + * On rare occasions the broadcast level triggered ack gets + * delayed going to ioapics, and if we reprogram the + * vector while Remote IRR is still set the irq will never + * fire again. + * + * To prevent this scenario we read the Remote IRR bit + * of the ioapic. This has two effects. + * - On any sane system the read of the ioapic will + * flush writes (and acks) going to the ioapic from + * this cpu. + * - We get to see if the ACK has actually been delivered. + * + * Based on failed experiments of reprogramming the + * ioapic entry from outside of irq context starting + * with masking the ioapic entry and then polling until + * Remote IRR was clear before reprogramming the + * ioapic I don't trust the Remote IRR bit to be + * completey accurate. + * + * However there appears to be no other way to plug + * this race, so if the Remote IRR bit is not + * accurate and is causing problems then it is a hardware bug + * and you can go talk to the chipset vendor about it. + */ + if (!io_apic_level_ack_pending(cfg)) + irq_move_masked_irq(data); + unmask_ioapic(cfg); + } +} + +#ifdef CONFIG_IRQ_REMAP +static void ir_ack_apic_edge(struct irq_data *data) +{ + ack_APIC_irq(); +} + +static void ir_ack_apic_level(struct irq_data *data) +{ + ack_APIC_irq(); + eoi_ioapic_irq(data->irq, data->chip_data); +} + +static void ir_print_prefix(struct irq_data *data, struct seq_file *p) +{ + seq_printf(p, " IR-%s", data->chip->name); +} + +static void irq_remap_modify_chip_defaults(struct irq_chip *chip) +{ + chip->irq_print_chip = ir_print_prefix; + chip->irq_ack = ir_ack_apic_edge; + chip->irq_eoi = ir_ack_apic_level; + +#ifdef CONFIG_SMP + chip->irq_set_affinity = ir_ioapic_set_affinity; +#endif +} +#endif /* CONFIG_IRQ_REMAP */ + +static struct irq_chip ioapic_chip __read_mostly = { + .name = "IO-APIC", + .irq_startup = startup_ioapic_irq, + .irq_mask = mask_ioapic_irq, + .irq_unmask = unmask_ioapic_irq, + .irq_ack = ack_apic_edge, + .irq_eoi = ack_apic_level, +#ifdef CONFIG_SMP + .irq_set_affinity = ioapic_set_affinity, +#endif + .irq_retrigger = ioapic_retrigger_irq, +}; +#endif /* !CONFIG_XEN */ + +static inline void init_IO_APIC_traps(void) +{ + struct irq_cfg *cfg; + unsigned int irq; + + /* + * NOTE! The local APIC isn't very good at handling + * multiple interrupts at the same interrupt level. + * As the interrupt level is determined by taking the + * vector number and shifting that right by 4, we + * want to spread these out a bit so that they don't + * all fall in the same interrupt level. + * + * Also, we've got to be careful not to trash gate + * 0x80, because int 0x80 is hm, kind of importantish. ;) + */ + for_each_active_irq(irq) { +#ifdef CONFIG_XEN + if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs) + continue; +#endif + cfg = irq_get_chip_data(irq); + if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { + /* + * Hmm.. We don't have an entry for this, + * so default to an old-fashioned 8259 + * interrupt if we can.. 
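+ *
+ * i.e., in sketch form:
+ *
+ *	irq < legacy_pic->nr_legacy_irqs
+ *		? hand the line back to the 8259A via legacy_pic->make_irq()
+ *		: attach no_irq_chip so stray requests are refused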
+ */ + if (irq < legacy_pic->nr_legacy_irqs) + legacy_pic->make_irq(irq); + else + /* Strange. Oh, well.. */ + irq_set_chip(irq, &no_irq_chip); + } + } +} + +#ifndef CONFIG_XEN +/* + * The local APIC irq-chip implementation: + */ + +static void mask_lapic_irq(struct irq_data *data) +{ + unsigned long v; + + v = apic_read(APIC_LVT0); + apic_write(APIC_LVT0, v | APIC_LVT_MASKED); +} + +static void unmask_lapic_irq(struct irq_data *data) +{ + unsigned long v; + + v = apic_read(APIC_LVT0); + apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); +} + +static void ack_lapic_irq(struct irq_data *data) +{ + ack_APIC_irq(); +} + +static struct irq_chip lapic_chip __read_mostly = { + .name = "local-APIC", + .irq_mask = mask_lapic_irq, + .irq_unmask = unmask_lapic_irq, + .irq_ack = ack_lapic_irq, +}; + +static void lapic_register_intr(int irq) +{ + irq_clear_status_flags(irq, IRQ_LEVEL); + irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, + "edge"); +} + +/* + * This looks a bit hackish but it's about the only one way of sending + * a few INTA cycles to 8259As and any associated glue logic. ICR does + * not support the ExtINT mode, unfortunately. We need to send these + * cycles as some i82489DX-based boards have glue logic that keeps the + * 8259A interrupt line asserted until INTA. --macro + */ +static inline void __init unlock_ExtINT_logic(void) +{ + int apic, pin, i; + struct IO_APIC_route_entry entry0, entry1; + unsigned char save_control, save_freq_select; + + pin = find_isa_irq_pin(8, mp_INT); + if (pin == -1) { + WARN_ON_ONCE(1); + return; + } + apic = find_isa_irq_apic(8, mp_INT); + if (apic == -1) { + WARN_ON_ONCE(1); + return; + } + + entry0 = ioapic_read_entry(apic, pin); + clear_IO_APIC_pin(apic, pin); + + memset(&entry1, 0, sizeof(entry1)); + + entry1.dest_mode = 0; /* physical delivery */ + entry1.mask = 0; /* unmask IRQ now */ + entry1.dest = hard_smp_processor_id(); + entry1.delivery_mode = dest_ExtINT; + entry1.polarity = entry0.polarity; + entry1.trigger = 0; + entry1.vector = 0; + + ioapic_write_entry(apic, pin, entry1); + + save_control = CMOS_READ(RTC_CONTROL); + save_freq_select = CMOS_READ(RTC_FREQ_SELECT); + CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, + RTC_FREQ_SELECT); + CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); + + i = 100; + while (i-- > 0) { + mdelay(10); + if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) + i -= 10; + } + + CMOS_WRITE(save_control, RTC_CONTROL); + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); + clear_IO_APIC_pin(apic, pin); + + ioapic_write_entry(apic, pin, entry0); +} + +static int disable_timer_pin_1 __initdata; +/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ +static int __init disable_timer_pin_setup(char *arg) +{ + disable_timer_pin_1 = 1; + return 0; +} +early_param("disable_timer_pin_1", disable_timer_pin_setup); + +int timer_through_8259 __initdata; + +/* + * This code may look a bit paranoid, but it's supposed to cooperate with + * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ + * is so screwy. Thanks to Brian Perkins for testing/hacking this beast + * fanatically on his truly buggy board. + * + * FIXME: really need to revamp this for all platforms. 
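+ *
+ * The fallback ladder tried below, in outline:
+ *
+ *	1) IRQ0 through the IO-APIC pin the MP table reports for the timer
+ *	2) IRQ0 through the IO-APIC pin of the cascaded 8259A
+ *	3) the timer as a local APIC virtual wire (LVT0, fixed vector)
+ *	4) the timer as an ExtINT IRQ via the 8259A
+ *
+ * and only if all four fail does it panic.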
+ */ +static inline void __init check_timer(void) +{ + struct irq_cfg *cfg = irq_get_chip_data(0); + int node = cpu_to_node(0); + int apic1, pin1, apic2, pin2; + unsigned long flags; + int no_pin1 = 0; + + local_irq_save(flags); + + /* + * get/set the timer IRQ vector: + */ + legacy_pic->mask(0); + assign_irq_vector(0, cfg, apic->target_cpus()); + + /* + * As IRQ0 is to be enabled in the 8259A, the virtual + * wire has to be disabled in the local APIC. Also + * timer interrupts need to be acknowledged manually in + * the 8259A for the i82489DX when using the NMI + * watchdog as that APIC treats NMIs as level-triggered. + * The AEOI mode will finish them in the 8259A + * automatically. + */ + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); + legacy_pic->init(1); + + pin1 = find_isa_irq_pin(0, mp_INT); + apic1 = find_isa_irq_apic(0, mp_INT); + pin2 = ioapic_i8259.pin; + apic2 = ioapic_i8259.apic; + + apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " + "apic1=%d pin1=%d apic2=%d pin2=%d\n", + cfg->vector, apic1, pin1, apic2, pin2); + + /* + * Some BIOS writers are clueless and report the ExtINTA + * I/O APIC input from the cascaded 8259A as the timer + * interrupt input. So just in case, if only one pin + * was found above, try it both directly and through the + * 8259A. + */ + if (pin1 == -1) { + if (intr_remapping_enabled) + panic("BIOS bug: timer not connected to IO-APIC"); + pin1 = pin2; + apic1 = apic2; + no_pin1 = 1; + } else if (pin2 == -1) { + pin2 = pin1; + apic2 = apic1; + } + + if (pin1 != -1) { + /* + * Ok, does IRQ0 through the IOAPIC work? + */ + if (no_pin1) { + add_pin_to_irq_node(cfg, node, apic1, pin1); + setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); + } else { + /* for edge trigger, setup_ioapic_irq already + * leave it unmasked. + * so only need to unmask if it is level-trigger + * do we really have level trigger timer? + */ + int idx; + idx = find_irq_entry(apic1, pin1, mp_INT); + if (idx != -1 && irq_trigger(idx)) + unmask_ioapic(cfg); + } + if (timer_irq_works()) { + if (disable_timer_pin_1 > 0) + clear_IO_APIC_pin(0, pin1); + goto out; + } + if (intr_remapping_enabled) + panic("timer doesn't work through Interrupt-remapped IO-APIC"); + local_irq_disable(); + clear_IO_APIC_pin(apic1, pin1); + if (!no_pin1) + apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " + "8254 timer not connected to IO-APIC\n"); + + apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer " + "(IRQ0) through the 8259A ...\n"); + apic_printk(APIC_QUIET, KERN_INFO + "..... (found apic %d pin %d) ...\n", apic2, pin2); + /* + * legacy devices should be connected to IO APIC #0 + */ + replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); + setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); + legacy_pic->unmask(0); + if (timer_irq_works()) { + apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); + timer_through_8259 = 1; + goto out; + } + /* + * Cleanup, just in case ... + */ + local_irq_disable(); + legacy_pic->mask(0); + clear_IO_APIC_pin(apic2, pin2); + apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); + } + + apic_printk(APIC_QUIET, KERN_INFO + "...trying to set up timer as Virtual Wire IRQ...\n"); + + lapic_register_intr(0); + apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ + legacy_pic->unmask(0); + + if (timer_irq_works()) { + apic_printk(APIC_QUIET, KERN_INFO "..... 
works.\n"); + goto out; + } + local_irq_disable(); + legacy_pic->mask(0); + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); + apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); + + apic_printk(APIC_QUIET, KERN_INFO + "...trying to set up timer as ExtINT IRQ...\n"); + + legacy_pic->init(0); + legacy_pic->make_irq(0); + apic_write(APIC_LVT0, APIC_DM_EXTINT); + + unlock_ExtINT_logic(); + + if (timer_irq_works()) { + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); + goto out; + } + local_irq_disable(); + apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); + if (x2apic_preenabled) + apic_printk(APIC_QUIET, KERN_INFO + "Perhaps problem with the pre-enabled x2apic mode\n" + "Try booting with x2apic and interrupt-remapping disabled in the bios.\n"); + panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " + "report. Then try booting with the 'noapic' option.\n"); +out: + local_irq_restore(flags); +} +#else +#define check_timer() ((void)0) +#endif + +/* + * Traditionally ISA IRQ2 is the cascade IRQ, and is not available + * to devices. However there may be an I/O APIC pin available for + * this interrupt regardless. The pin may be left unconnected, but + * typically it will be reused as an ExtINT cascade interrupt for + * the master 8259A. In the MPS case such a pin will normally be + * reported as an ExtINT interrupt in the MP table. With ACPI + * there is no provision for ExtINT interrupts, and in the absence + * of an override it would be treated as an ordinary ISA I/O APIC + * interrupt, that is edge-triggered and unmasked by default. We + * used to do this, but it caused problems on some systems because + * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using + * the same ExtINT cascade interrupt to drive the local APIC of the + * bootstrap processor. Therefore we refrain from routing IRQ2 to + * the I/O APIC in all cases now. No actual device should request + * it anyway. --macro + */ +#define PIC_IRQS (1UL << PIC_CASCADE_IR) + +void __init setup_IO_APIC(void) +{ + + /* + * calling enable_IO_APIC() is moved to setup_local_APIC for BP + */ + io_apic_irqs = legacy_pic->nr_legacy_irqs ? ~PIC_IRQS : ~0UL; + + apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); + /* + * Set up IO-APIC IRQ routing. + */ +#ifndef CONFIG_XEN + x86_init.mpparse.setup_ioapic_ids(); + + sync_Arb_IDs(); +#endif + setup_IO_APIC_irqs(); + init_IO_APIC_traps(); + if (legacy_pic->nr_legacy_irqs) + check_timer(); +} + +/* + * Called after all the initialization is done. If we didn't find any + * APIC bugs then we can allow the modify fast path + */ + +static int __init io_apic_bug_finalize(void) +{ + if (sis_apic_bug == -1) + sis_apic_bug = 0; +#ifdef CONFIG_X86_XEN + if (is_initial_xendomain()) { + struct xen_platform_op op = { .cmd = XENPF_platform_quirk }; + op.u.platform_quirk.quirk_id = sis_apic_bug ? 
+ QUIRK_IOAPIC_BAD_REGSEL : QUIRK_IOAPIC_GOOD_REGSEL; + VOID(HYPERVISOR_platform_op(&op)); + } +#endif + return 0; +} + +late_initcall(io_apic_bug_finalize); + +#ifndef CONFIG_XEN +static void resume_ioapic_id(int ioapic_idx) +{ + unsigned long flags; + union IO_APIC_reg_00 reg_00; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic_idx, 0); + if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) { + reg_00.bits.ID = mpc_ioapic_id(ioapic_idx); + io_apic_write(ioapic_idx, 0, reg_00.raw); + } + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void ioapic_resume(void) +{ + int ioapic_idx; + + for (ioapic_idx = nr_ioapics - 1; ioapic_idx >= 0; ioapic_idx--) + resume_ioapic_id(ioapic_idx); + + restore_ioapic_entries(); +} + +static struct syscore_ops ioapic_syscore_ops = { + .suspend = save_ioapic_entries, + .resume = ioapic_resume, +}; + +static int __init ioapic_init_ops(void) +{ + register_syscore_ops(&ioapic_syscore_ops); + + return 0; +} + +device_initcall(ioapic_init_ops); + +/* + * Dynamic irq allocate and deallocation + */ +unsigned int create_irq_nr(unsigned int from, int node) +{ + struct irq_cfg *cfg; + unsigned long flags; + unsigned int ret = 0; + int irq; + + if (from < nr_irqs_gsi) + from = nr_irqs_gsi; + + irq = alloc_irq_from(from, node); + if (irq < 0) + return 0; + cfg = alloc_irq_cfg(irq, node); + if (!cfg) { + free_irq_at(irq, NULL); + return 0; + } + + raw_spin_lock_irqsave(&vector_lock, flags); + if (!__assign_irq_vector(irq, cfg, apic->target_cpus())) + ret = irq; + raw_spin_unlock_irqrestore(&vector_lock, flags); + + if (ret) { + irq_set_chip_data(irq, cfg); + irq_clear_status_flags(irq, IRQ_NOREQUEST); + } else { + free_irq_at(irq, cfg); + } + return ret; +} + +int create_irq(void) +{ + int node = cpu_to_node(0); + unsigned int irq_want; + int irq; + + irq_want = nr_irqs_gsi; + irq = create_irq_nr(irq_want, node); + + if (irq == 0) + irq = -1; + + return irq; +} + +void destroy_irq(unsigned int irq) +{ + struct irq_cfg *cfg = irq_get_chip_data(irq); + unsigned long flags; + + irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE); + + if (irq_remapped(cfg)) + free_irte(irq); + raw_spin_lock_irqsave(&vector_lock, flags); + __clear_irq_vector(irq, cfg); + raw_spin_unlock_irqrestore(&vector_lock, flags); + free_irq_at(irq, cfg); +} +#endif /* !CONFIG_XEN */ + +/* + * MSI message composition + */ +#if defined(CONFIG_PCI_MSI) && !defined(CONFIG_XEN) +static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, + struct msi_msg *msg, u8 hpet_id) +{ + struct irq_cfg *cfg; + int err; + unsigned dest; + + if (disable_apic) + return -ENXIO; + + cfg = irq_cfg(irq); + err = assign_irq_vector(irq, cfg, apic->target_cpus()); + if (err) + return err; + + dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); + + if (irq_remapped(cfg)) { + struct irte irte; + int ir_index; + u16 sub_handle; + + ir_index = map_irq_to_irte_handle(irq, &sub_handle); + BUG_ON(ir_index == -1); + + prepare_irte(&irte, cfg->vector, dest); + + /* Set source-id of interrupt request */ + if (pdev) + set_msi_sid(&irte, pdev); + else + set_hpet_sid(&irte, hpet_id); + + modify_irte(irq, &irte); + + msg->address_hi = MSI_ADDR_BASE_HI; + msg->data = sub_handle; + msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT | + MSI_ADDR_IR_SHV | + MSI_ADDR_IR_INDEX1(ir_index) | + MSI_ADDR_IR_INDEX2(ir_index); + } else { + if (x2apic_enabled()) + msg->address_hi = MSI_ADDR_BASE_HI | + MSI_ADDR_EXT_DEST_ID(dest); + else + msg->address_hi = MSI_ADDR_BASE_HI; + + 
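+		/*
+		 * Sketch of the non-remapped encoding being composed here:
+		 * the destination APIC ID travels in the address word
+		 * (0xFEE00000 | dest << 12, plus the mode bits below) and
+		 * the vector and delivery mode travel in the data word.
+		 */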
msg->address_lo = + MSI_ADDR_BASE_LO | + ((apic->irq_dest_mode == 0) ? + MSI_ADDR_DEST_MODE_PHYSICAL: + MSI_ADDR_DEST_MODE_LOGICAL) | + ((apic->irq_delivery_mode != dest_LowestPrio) ? + MSI_ADDR_REDIRECTION_CPU: + MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_DEST_ID(dest); + + msg->data = + MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + ((apic->irq_delivery_mode != dest_LowestPrio) ? + MSI_DATA_DELIVERY_FIXED: + MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_VECTOR(cfg->vector); + } + return err; +} + +#ifdef CONFIG_SMP +static int +msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) +{ + struct irq_cfg *cfg = data->chip_data; + struct msi_msg msg; + unsigned int dest; + + if (__ioapic_set_affinity(data, mask, &dest)) + return -1; + + __get_cached_msi_msg(data->msi_desc, &msg); + + msg.data &= ~MSI_DATA_VECTOR_MASK; + msg.data |= MSI_DATA_VECTOR(cfg->vector); + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg.address_lo |= MSI_ADDR_DEST_ID(dest); + + __write_msi_msg(data->msi_desc, &msg); + + return 0; +} +#endif /* CONFIG_SMP */ + +/* + * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, + * which implement the MSI or MSI-X Capability Structure. + */ +static struct irq_chip msi_chip = { + .name = "PCI-MSI", + .irq_unmask = unmask_msi_irq, + .irq_mask = mask_msi_irq, + .irq_ack = ack_apic_edge, +#ifdef CONFIG_SMP + .irq_set_affinity = msi_set_affinity, +#endif + .irq_retrigger = ioapic_retrigger_irq, +}; + +/* + * Map the PCI dev to the corresponding remapping hardware unit + * and allocate 'nvec' consecutive interrupt-remapping table entries + * in it. + */ +static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) +{ + struct intel_iommu *iommu; + int index; + + iommu = map_dev_to_ir(dev); + if (!iommu) { + printk(KERN_ERR + "Unable to map PCI %s to iommu\n", pci_name(dev)); + return -ENOENT; + } + + index = alloc_irte(iommu, irq, nvec); + if (index < 0) { + printk(KERN_ERR + "Unable to allocate %d IRTE for PCI %s\n", nvec, + pci_name(dev)); + return -ENOSPC; + } + return index; +} + +static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) +{ + struct irq_chip *chip = &msi_chip; + struct msi_msg msg; + int ret; + + ret = msi_compose_msg(dev, irq, &msg, -1); + if (ret < 0) + return ret; + + irq_set_msi_desc(irq, msidesc); + write_msi_msg(irq, &msg); + + if (irq_remapped(irq_get_chip_data(irq))) { + irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); + irq_remap_modify_chip_defaults(chip); + } + + irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); + + dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq); + + return 0; +} + +int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +{ + int node, ret, sub_handle, index = 0; + unsigned int irq, irq_want; + struct msi_desc *msidesc; + struct intel_iommu *iommu = NULL; + + /* x86 doesn't support multiple MSI yet */ + if (type == PCI_CAP_ID_MSI && nvec > 1) + return 1; + + node = dev_to_node(&dev->dev); + irq_want = nr_irqs_gsi; + sub_handle = 0; + list_for_each_entry(msidesc, &dev->msi_list, list) { + irq = create_irq_nr(irq_want, node); + if (irq == 0) + return -1; + irq_want = irq + 1; + if (!intr_remapping_enabled) + goto no_ir; + + if (!sub_handle) { + /* + * allocate the consecutive block of IRTE's + * for 'nvec' + */ + index = msi_alloc_irte(dev, irq, nvec); + if (index < 0) { + ret = index; + goto error; + } + } else { + iommu = map_dev_to_ir(dev); + if (!iommu) { + ret = -ENOENT; + goto error; + } + /* + * setup the mapping between the irq and the IRTE + * 
base index, the sub_handle pointing to the + * appropriate interrupt remap table entry. + */ + set_irte_irq(irq, iommu, index, sub_handle); + } +no_ir: + ret = setup_msi_irq(dev, msidesc, irq); + if (ret < 0) + goto error; + sub_handle++; + } + return 0; + +error: + destroy_irq(irq); + return ret; +} + +void native_teardown_msi_irq(unsigned int irq) +{ + destroy_irq(irq); +} + +#ifdef CONFIG_DMAR_TABLE +#ifdef CONFIG_SMP +static int +dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) +{ + struct irq_cfg *cfg = data->chip_data; + unsigned int dest, irq = data->irq; + struct msi_msg msg; + + if (__ioapic_set_affinity(data, mask, &dest)) + return -1; + + dmar_msi_read(irq, &msg); + + msg.data &= ~MSI_DATA_VECTOR_MASK; + msg.data |= MSI_DATA_VECTOR(cfg->vector); + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg.address_lo |= MSI_ADDR_DEST_ID(dest); + msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest); + + dmar_msi_write(irq, &msg); + + return 0; +} + +#endif /* CONFIG_SMP */ + +static struct irq_chip dmar_msi_type = { + .name = "DMAR_MSI", + .irq_unmask = dmar_msi_unmask, + .irq_mask = dmar_msi_mask, + .irq_ack = ack_apic_edge, +#ifdef CONFIG_SMP + .irq_set_affinity = dmar_msi_set_affinity, +#endif + .irq_retrigger = ioapic_retrigger_irq, +}; + +int arch_setup_dmar_msi(unsigned int irq) +{ + int ret; + struct msi_msg msg; + + ret = msi_compose_msg(NULL, irq, &msg, -1); + if (ret < 0) + return ret; + dmar_msi_write(irq, &msg); + irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, + "edge"); + return 0; +} +#endif + +#ifdef CONFIG_HPET_TIMER + +#ifdef CONFIG_SMP +static int hpet_msi_set_affinity(struct irq_data *data, + const struct cpumask *mask, bool force) +{ + struct irq_cfg *cfg = data->chip_data; + struct msi_msg msg; + unsigned int dest; + + if (__ioapic_set_affinity(data, mask, &dest)) + return -1; + + hpet_msi_read(data->handler_data, &msg); + + msg.data &= ~MSI_DATA_VECTOR_MASK; + msg.data |= MSI_DATA_VECTOR(cfg->vector); + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg.address_lo |= MSI_ADDR_DEST_ID(dest); + + hpet_msi_write(data->handler_data, &msg); + + return 0; +} + +#endif /* CONFIG_SMP */ + +static struct irq_chip hpet_msi_type = { + .name = "HPET_MSI", + .irq_unmask = hpet_msi_unmask, + .irq_mask = hpet_msi_mask, + .irq_ack = ack_apic_edge, +#ifdef CONFIG_SMP + .irq_set_affinity = hpet_msi_set_affinity, +#endif + .irq_retrigger = ioapic_retrigger_irq, +}; + +int arch_setup_hpet_msi(unsigned int irq, unsigned int id) +{ + struct irq_chip *chip = &hpet_msi_type; + struct msi_msg msg; + int ret; + + if (intr_remapping_enabled) { + struct intel_iommu *iommu = map_hpet_to_ir(id); + int index; + + if (!iommu) + return -1; + + index = alloc_irte(iommu, irq, 1); + if (index < 0) + return -1; + } + + ret = msi_compose_msg(NULL, irq, &msg, id); + if (ret < 0) + return ret; + + hpet_msi_write(irq_get_handler_data(irq), &msg); + irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); + if (irq_remapped(irq_get_chip_data(irq))) + irq_remap_modify_chip_defaults(chip); + + irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); + return 0; +} +#endif + +#endif /* CONFIG_PCI_MSI */ +/* + * Hypertransport interrupt support + */ +#ifdef CONFIG_HT_IRQ + +#ifdef CONFIG_SMP + +static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) +{ + struct ht_irq_msg msg; + fetch_ht_irq_msg(irq, &msg); + + msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK); + msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); + + 
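+	/*
+	 * The vector and destination fields were cleared just above; the
+	 * new values are OR-ed back in below before the message is
+	 * rewritten.
+	 */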
msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest); + msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); + + write_ht_irq_msg(irq, &msg); +} + +static int +ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) +{ + struct irq_cfg *cfg = data->chip_data; + unsigned int dest; + + if (__ioapic_set_affinity(data, mask, &dest)) + return -1; + + target_ht_irq(data->irq, dest, cfg->vector); + return 0; +} + +#endif + +static struct irq_chip ht_irq_chip = { + .name = "PCI-HT", + .irq_mask = mask_ht_irq, + .irq_unmask = unmask_ht_irq, + .irq_ack = ack_apic_edge, +#ifdef CONFIG_SMP + .irq_set_affinity = ht_set_affinity, +#endif + .irq_retrigger = ioapic_retrigger_irq, +}; + +int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) +{ + struct irq_cfg *cfg; + int err; + + if (disable_apic) + return -ENXIO; + + cfg = irq_cfg(irq); + err = assign_irq_vector(irq, cfg, apic->target_cpus()); + if (!err) { + struct ht_irq_msg msg; + unsigned dest; + + dest = apic->cpu_mask_to_apicid_and(cfg->domain, + apic->target_cpus()); + + msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); + + msg.address_lo = + HT_IRQ_LOW_BASE | + HT_IRQ_LOW_DEST_ID(dest) | + HT_IRQ_LOW_VECTOR(cfg->vector) | + ((apic->irq_dest_mode == 0) ? + HT_IRQ_LOW_DM_PHYSICAL : + HT_IRQ_LOW_DM_LOGICAL) | + HT_IRQ_LOW_RQEOI_EDGE | + ((apic->irq_delivery_mode != dest_LowestPrio) ? + HT_IRQ_LOW_MT_FIXED : + HT_IRQ_LOW_MT_ARBITRATED) | + HT_IRQ_LOW_IRQ_MASKED; + + write_ht_irq_msg(irq, &msg); + + irq_set_chip_and_handler_name(irq, &ht_irq_chip, + handle_edge_irq, "edge"); + + dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); + } + return err; +} +#endif /* CONFIG_HT_IRQ */ + +static int +io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr) +{ + struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node); + int ret; + + if (!cfg) + return -EINVAL; + ret = __add_pin_to_irq_node(cfg, node, attr->ioapic, attr->ioapic_pin); + if (!ret) + setup_ioapic_irq(irq, cfg, attr); + return ret; +} + +int io_apic_setup_irq_pin_once(unsigned int irq, int node, + struct io_apic_irq_attr *attr) +{ + unsigned int ioapic_idx = attr->ioapic, pin = attr->ioapic_pin; + int ret; + + /* Avoid redundant programming */ + if (test_bit(pin, ioapics[ioapic_idx].pin_programmed)) { + pr_debug("Pin %d-%d already programmed\n", + mpc_ioapic_id(ioapic_idx), pin); + return 0; + } + ret = io_apic_setup_irq_pin(irq, node, attr); + if (!ret) + set_bit(pin, ioapics[ioapic_idx].pin_programmed); + return ret; +} + +static int __init io_apic_get_redir_entries(int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + /* The register returns the maximum index redir index + * supported, which is one less than the total number of redir + * entries. 
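+ *
+ * e.g. a typical 24-pin IO-APIC reports 23 (0x17) here, so the + 1
+ * below turns the highest redirection index back into a pin count.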
+ */ + return reg_01.bits.entries + 1; +} + +#ifndef CONFIG_XEN +static void __init probe_nr_irqs_gsi(void) +{ + int nr; + + nr = gsi_top + NR_IRQS_LEGACY; + if (nr > nr_irqs_gsi) + nr_irqs_gsi = nr; + + printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); +} + +int get_nr_irqs_gsi(void) +{ + return nr_irqs_gsi; +} + +int __init arch_probe_nr_irqs(void) +{ + int nr; + + if (nr_irqs > (NR_VECTORS * nr_cpu_ids)) + nr_irqs = NR_VECTORS * nr_cpu_ids; + + nr = nr_irqs_gsi + 8 * nr_cpu_ids; +#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ) + /* + * for MSI and HT dyn irq + */ + nr += nr_irqs_gsi * 16; +#endif + if (nr < nr_irqs) + nr_irqs = nr; + + return NR_IRQS_LEGACY; +} +#endif /* CONFIG_XEN */ + +int io_apic_set_pci_routing(struct device *dev, int irq, + struct io_apic_irq_attr *irq_attr) +{ + int node; + +#ifdef CONFIG_XEN + if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs) { + apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ %d\n", + irq_attr->ioapic, irq); + return -EINVAL; + } +#endif + if (!IO_APIC_IRQ(irq)) { + apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", + irq_attr->ioapic); + return -EINVAL; + } + + node = dev ? dev_to_node(dev) : cpu_to_node(0); + + return io_apic_setup_irq_pin_once(irq, node, irq_attr); +} + +#ifdef CONFIG_X86_32 +#ifndef CONFIG_XEN +static int __init io_apic_get_unique_id(int ioapic, int apic_id) +{ + union IO_APIC_reg_00 reg_00; + static physid_mask_t apic_id_map = PHYSID_MASK_NONE; + physid_mask_t tmp; + unsigned long flags; + int i = 0; + + /* + * The P4 platform supports up to 256 APIC IDs on two separate APIC + * buses (one for LAPICs, one for IOAPICs), where predecessors only + * supports up to 16 on one shared APIC bus. + * + * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full + * advantage of new APIC bus architecture. + */ + + if (physids_empty(apic_id_map)) + apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map); + + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic, 0); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + if (apic_id >= get_physical_broadcast()) { + printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " + "%d\n", ioapic, apic_id, reg_00.bits.ID); + apic_id = reg_00.bits.ID; + } + + /* + * Every APIC in a system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. 
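+ *
+ * If the requested ID is already taken, the fallback below simply picks
+ * the first ID not yet present in apic_id_map, sketched as:
+ *
+ *	while (apic->check_apicid_used(&apic_id_map, i))
+ *		i++;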
+ */ + if (apic->check_apicid_used(&apic_id_map, apic_id)) { + + for (i = 0; i < get_physical_broadcast(); i++) { + if (!apic->check_apicid_used(&apic_id_map, i)) + break; + } + + if (i == get_physical_broadcast()) + panic("Max apic_id exceeded!\n"); + + printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " + "trying %d\n", ioapic, apic_id, i); + + apic_id = i; + } + + apic->apicid_to_cpu_present(apic_id, &tmp); + physids_or(apic_id_map, apic_id_map, tmp); + + if (reg_00.bits.ID != apic_id) { + reg_00.bits.ID = apic_id; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(ioapic, 0, reg_00.raw); + reg_00.raw = io_apic_read(ioapic, 0); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + /* Sanity check */ + if (reg_00.bits.ID != apic_id) { + printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); + return -1; + } + } + + apic_printk(APIC_VERBOSE, KERN_INFO + "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); + + return apic_id; +} +#endif + +static u8 __init io_apic_unique_id(u8 id) +{ +#ifndef CONFIG_XEN + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && + !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + return io_apic_get_unique_id(nr_ioapics, id); + else +#endif + return id; +} +#else +static u8 __init io_apic_unique_id(u8 id) +{ + int i; + DECLARE_BITMAP(used, 256); + + bitmap_zero(used, 256); + for (i = 0; i < nr_ioapics; i++) { + __set_bit(mpc_ioapic_id(i), used); + } + if (!test_bit(id, used)) + return id; + return find_first_zero_bit(used, 256); +} +#endif + +static int __init io_apic_get_version(int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + return reg_01.bits.version; +} + +int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity) +{ + int ioapic, pin, idx; + + if (skip_ioapic_setup) + return -1; + + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + return -1; + + pin = mp_find_ioapic_pin(ioapic, gsi); + if (pin < 0) + return -1; + + idx = find_irq_entry(ioapic, pin, mp_INT); + if (idx < 0) + return -1; + + *trigger = irq_trigger(idx); + *polarity = irq_polarity(idx); + return 0; +} + +#ifndef CONFIG_XEN +/* + * This function currently is only a helper for the i386 smp boot process where + * we need to reprogram the ioredtbls to cater for the cpus which have come online + * so mask in all cases should simply be apic->target_cpus() + */ +#ifdef CONFIG_SMP +void __init setup_ioapic_dest(void) +{ + int pin, ioapic, irq, irq_entry; + const struct cpumask *mask; + struct irq_data *idata; + + if (skip_ioapic_setup == 1) + return; + + for (ioapic = 0; ioapic < nr_ioapics; ioapic++) + for (pin = 0; pin < ioapics[ioapic].nr_registers; pin++) { + irq_entry = find_irq_entry(ioapic, pin, mp_INT); + if (irq_entry == -1) + continue; + irq = pin_2_irq(irq_entry, ioapic, pin); + + if ((ioapic > 0) && (irq > 16)) + continue; + + idata = irq_get_irq_data(irq); + + /* + * Honour affinities which have been set in early boot + */ + if (!irqd_can_balance(idata) || irqd_affinity_was_set(idata)) + mask = idata->affinity; + else + mask = apic->target_cpus(); + + if (intr_remapping_enabled) + ir_ioapic_set_affinity(idata, mask, false); + else + ioapic_set_affinity(idata, mask, false); + } + +} +#endif + +#define IOAPIC_RESOURCE_NAME_SIZE 11 + +static struct resource *ioapic_resources; + +static struct resource * __init ioapic_setup_resources(int nr_ioapics) +{ + unsigned long n; + struct resource 
*res; + char *mem; + int i; + + if (nr_ioapics <= 0) + return NULL; + + n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); + n *= nr_ioapics; + + mem = alloc_bootmem(n); + res = (void *)mem; + + mem += sizeof(struct resource) * nr_ioapics; + + for (i = 0; i < nr_ioapics; i++) { + res[i].name = mem; + res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; + snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i); + mem += IOAPIC_RESOURCE_NAME_SIZE; + } + + ioapic_resources = res; + + return res; +} + +void __init ioapic_and_gsi_init(void) +{ + unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; + struct resource *ioapic_res; + int i; + + ioapic_res = ioapic_setup_resources(nr_ioapics); + for (i = 0; i < nr_ioapics; i++) { + if (smp_found_config) { + ioapic_phys = mpc_ioapic_addr(i); +#ifdef CONFIG_X86_32 + if (!ioapic_phys) { + printk(KERN_ERR + "WARNING: bogus zero IO-APIC " + "address found in MPTABLE, " + "disabling IO/APIC support!\n"); + smp_found_config = 0; + skip_ioapic_setup = 1; + goto fake_ioapic_page; + } +#endif + } else { +#ifdef CONFIG_X86_32 +fake_ioapic_page: +#endif + ioapic_phys = (unsigned long)alloc_bootmem_pages(PAGE_SIZE); + ioapic_phys = __pa(ioapic_phys); + } + set_fixmap_nocache(idx, ioapic_phys); + apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n", + __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK), + ioapic_phys); + idx++; + + ioapic_res->start = ioapic_phys; + ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1; + ioapic_res++; + } + + probe_nr_irqs_gsi(); +} + +void __init ioapic_insert_resources(void) +{ + int i; + struct resource *r = ioapic_resources; + + if (!r) { + if (nr_ioapics > 0) + printk(KERN_ERR + "IO APIC resources couldn't be allocated.\n"); + return; + } + + for (i = 0; i < nr_ioapics; i++) { + insert_resource(&iomem_resource, r); + r++; + } +} +#endif /* !CONFIG_XEN */ + +int mp_find_ioapic(u32 gsi) +{ + int i = 0; + + if (nr_ioapics == 0) + return -1; + + /* Find the IOAPIC that manages this GSI. 
*/ + for (i = 0; i < nr_ioapics; i++) { + struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(i); + if ((gsi >= gsi_cfg->gsi_base) + && (gsi <= gsi_cfg->gsi_end)) + return i; + } + + printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); + return -1; +} + +int mp_find_ioapic_pin(int ioapic, u32 gsi) +{ + struct mp_ioapic_gsi *gsi_cfg; + + if (WARN_ON(ioapic == -1)) + return -1; + + gsi_cfg = mp_ioapic_gsi_routing(ioapic); + if (WARN_ON(gsi > gsi_cfg->gsi_end)) + return -1; + + return gsi - gsi_cfg->gsi_base; +} + +static __init int bad_ioapic(unsigned long address) +{ + if (nr_ioapics >= MAX_IO_APICS) { + printk(KERN_WARNING "WARNING: Max # of I/O APICs (%d) exceeded " + "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics); + return 1; + } + if (!address) { + printk(KERN_WARNING "WARNING: Bogus (zero) I/O APIC address" + " found in table, skipping!\n"); + return 1; + } + return 0; +} + +void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) +{ + int idx = 0; + int entries; + struct mp_ioapic_gsi *gsi_cfg; + + if (bad_ioapic(address)) + return; + + idx = nr_ioapics; + + ioapics[idx].mp_config.type = MP_IOAPIC; + ioapics[idx].mp_config.flags = MPC_APIC_USABLE; + ioapics[idx].mp_config.apicaddr = address; + +#ifndef CONFIG_XEN + set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); +#endif + ioapics[idx].mp_config.apicid = io_apic_unique_id(id); + ioapics[idx].mp_config.apicver = io_apic_get_version(idx); + + /* + * Build basic GSI lookup table to facilitate gsi->io_apic lookups + * and to prevent reprogramming of IOAPIC pins (PCI GSIs). + */ + entries = io_apic_get_redir_entries(idx); + gsi_cfg = mp_ioapic_gsi_routing(idx); + gsi_cfg->gsi_base = gsi_base; + gsi_cfg->gsi_end = gsi_base + entries - 1; + + /* + * The number of IO-APIC IRQ registers (== #pins): + */ + ioapics[idx].nr_registers = entries; + + if (gsi_cfg->gsi_end >= gsi_top) + gsi_top = gsi_cfg->gsi_end + 1; + + printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " + "GSI %d-%d\n", idx, mpc_ioapic_id(idx), + mpc_ioapic_ver(idx), mpc_ioapic_addr(idx), + gsi_cfg->gsi_base, gsi_cfg->gsi_end); + + nr_ioapics++; +} + +#ifdef CONFIG_X86_MRST +/* Enable IOAPIC early just for system timer */ +void __init pre_init_apic_IRQ0(void) +{ + struct io_apic_irq_attr attr = { 0, 0, 0, 0 }; + + printk(KERN_INFO "Early APIC setup for system timer0\n"); +#ifndef CONFIG_SMP + physid_set_mask_of_physid(boot_cpu_physical_apicid, + &phys_cpu_present_map); +#endif + setup_local_APIC(); + + io_apic_setup_irq_pin(0, 0, &attr); + irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, + "edge"); +} +#endif diff --git a/arch/x86/kernel/apic/ipi-xen.c b/arch/x86/kernel/apic/ipi-xen.c new file mode 100644 index 0000000..a3ee607 --- /dev/null +++ b/arch/x86/kernel/apic/ipi-xen.c @@ -0,0 +1,43 @@ +#include +#include + +#include +#include + +#ifdef CONFIG_SMP +#include + +void xen_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) +{ + unsigned int cpu, this_cpu = smp_processor_id(); + + WARN_ON(!cpumask_subset(cpumask, cpu_online_mask)); + for_each_cpu_and(cpu, cpumask, cpu_online_mask) + if (cpu != this_cpu) + notify_remote_via_ipi(vector, cpu); +} + +void xen_send_IPI_mask(const struct cpumask *cpumask, int vector) +{ + unsigned int cpu; + + WARN_ON(!cpumask_subset(cpumask, cpu_online_mask)); + for_each_cpu_and(cpu, cpumask, cpu_online_mask) + notify_remote_via_ipi(vector, cpu); +} + +void xen_send_IPI_allbutself(int vector) +{ + xen_send_IPI_mask_allbutself(cpu_online_mask, vector); +} + +void 
xen_send_IPI_all(int vector) +{ + xen_send_IPI_mask(cpu_online_mask, vector); +} + +void xen_send_IPI_self(int vector) +{ + notify_remote_via_ipi(vector, smp_processor_id()); +} +#endif diff --git a/arch/x86/kernel/apic/probe_32-xen.c b/arch/x86/kernel/apic/probe_32-xen.c new file mode 100644 index 0000000..8602fa9 --- /dev/null +++ b/arch/x86/kernel/apic/probe_32-xen.c @@ -0,0 +1,57 @@ +/* + * Default generic APIC driver. This handles up to 8 CPUs. + * + * Copyright 2003 Andi Kleen, SuSE Labs. + * Subject to the GNU Public License, v.2 + * + * Generic x86 APIC driver probe layer. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +static int xen_phys_pkg_id(int cpuid_apic, int index_msb) +{ + return cpuid_apic; +} + +static struct apic apic_xen = { + + .name = "default", + + .irq_delivery_mode = dest_LowestPrio, + /* logical delivery broadcast to all CPUs: */ + .irq_dest_mode = 1, + + .target_cpus = default_target_cpus, + + .phys_pkg_id = xen_phys_pkg_id, + +#ifdef CONFIG_SMP + .send_IPI_mask = xen_send_IPI_mask, + .send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself, + .send_IPI_allbutself = xen_send_IPI_allbutself, + .send_IPI_all = xen_send_IPI_all, + .send_IPI_self = xen_send_IPI_self, +#endif +}; + +struct apic *apic = &apic_xen; +EXPORT_SYMBOL_GPL(apic); diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 68de2dc..3d01fce 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -17,7 +17,7 @@ #include #include -#ifdef CONFIG_XEN +#if defined(CONFIG_XEN) || defined(CONFIG_PARAVIRT_XEN) #include #endif @@ -55,7 +55,7 @@ void common(void) { OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); #endif -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN BLANK(); OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 85d98ab..cf6cc38 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -1,7 +1,9 @@ #include +#ifdef CONFIG_LGUEST_GUEST #include #include "../../../drivers/lguest/lg.h" +#endif #define __SYSCALL_I386(nr, sym, compat) [nr] = 1, static char syscalls[] = { @@ -60,9 +62,19 @@ void foo(void) OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); BLANK(); +#ifndef CONFIG_X86_NO_TSS /* Offset from the sysenter stack to tss.sp0 */ - DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - + DEFINE(SYSENTER_stack_sp0, offsetof(struct tss_struct, x86_tss.sp0) - sizeof(struct tss_struct)); +#else + /* sysenter stack points directly to sp0 */ + DEFINE(SYSENTER_stack_sp0, 0); +#endif + +#ifdef CONFIG_XEN + BLANK(); + OFFSET(XEN_START_mfn_list, start_info, mfn_list); +#endif #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) BLANK(); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 834e897..8d13cc2 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -70,8 +70,10 @@ int main(void) BLANK(); #undef ENTRY +#ifndef CONFIG_X86_NO_TSS OFFSET(TSS_ist, tss_struct, x86_tss.ist); BLANK(); +#endif DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); DEFINE(NR_syscalls, sizeof(syscalls_64)); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 6ab6aa2..fefe4b9 100644 --- 
a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -40,6 +40,9 @@ obj-$(CONFIG_MTRR) += mtrr/ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o perf_event_amd_ibs.o +disabled-obj-$(CONFIG_XEN) := hypervisor.o mshyperv.o perfctr-watchdog.o \ + perf_event.o perf_event_%.o sched.o vmware.o + quiet_cmd_mkcapflags = MKCAP $@ cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index f4773f4..a5b9661 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -333,7 +333,7 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) int amd_get_nb_id(int cpu) { int id = 0; -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) id = per_cpu(cpu_llc_id, cpu); #endif return id; @@ -467,7 +467,7 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) (c->x86_model == 8 && c->x86_mask >= 8)) set_cpu_cap(c, X86_FEATURE_K6_MTRR); #endif -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) && !defined(CONFIG_XEN) /* check CPU config space for extended APIC ID */ if (cpu_has_apic && c->x86 >= 0xf) { unsigned int val; @@ -480,7 +480,9 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) static void __cpuinit init_amd(struct cpuinfo_x86 *c) { +#ifndef CONFIG_XEN u32 dummy; +#endif #ifdef CONFIG_SMP unsigned long long value; @@ -525,18 +527,26 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) u64 val; clear_cpu_cap(c, X86_FEATURE_LAHF_LM); +#ifndef CONFIG_XEN if (!rdmsrl_amd_safe(0xc001100d, &val)) { val &= ~(1ULL << 32); wrmsrl_amd_safe(0xc001100d, val); } +#else + pr_warning("Long-mode LAHF feature wrongly enabled -" + "hypervisor update needed\n"); + (void)&val; +#endif } } if (c->x86 >= 0x10) set_cpu_cap(c, X86_FEATURE_REP_GOOD); +#ifndef CONFIG_XEN /* get apicid instead of initial apic id from cpuid */ c->apicid = hard_smp_processor_id(); +#endif #else /* @@ -612,6 +622,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) fam10h_check_enable_mmcfg(); } +#ifndef CONFIG_XEN if (c == &boot_cpu_data && c->x86 >= 0xf) { unsigned long long tseg; @@ -631,6 +642,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) } } #endif +#endif /* * Family 0x12 and above processors have APIC timer @@ -639,6 +651,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) if (c->x86 > 0x11) set_cpu_cap(c, X86_FEATURE_ARAT); +#ifndef CONFIG_XEN /* * Disable GART TLB Walk Errors on Fam10h. 
We do this here * because this is always needed when GART is enabled, even in a @@ -662,6 +675,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) } rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); +#endif } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 46674fb..09087bd2 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -17,6 +17,7 @@ #include #include +#ifndef CONFIG_XEN static int __init no_halt(char *s) { WARN_ONCE(1, "\"no-hlt\" is deprecated, please use \"idle=poll\"\n"); @@ -25,6 +26,7 @@ static int __init no_halt(char *s) } __setup("no-hlt", no_halt); +#endif static int __init no_387(char *s) { @@ -84,13 +86,16 @@ static void __init check_fpu(void) kernel_fpu_end(); +#ifndef CONFIG_XEN boot_cpu_data.fdiv_bug = fdiv_bug; if (boot_cpu_data.fdiv_bug) printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n"); +#endif } static void __init check_hlt(void) { +#ifndef CONFIG_XEN if (boot_cpu_data.x86 >= 5 || paravirt_enabled()) return; @@ -104,6 +109,7 @@ static void __init check_hlt(void) halt(); halt(); printk(KERN_CONT "OK.\n"); +#endif } /* diff --git a/arch/x86/kernel/cpu/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c index 04f0fe5..25a2cda 100644 --- a/arch/x86/kernel/cpu/bugs_64.c +++ b/arch/x86/kernel/cpu/bugs_64.c @@ -20,6 +20,7 @@ void __init check_bugs(void) #endif alternative_instructions(); +#ifndef CONFIG_XEN /* * Make sure the first 2MB area is not mapped by huge pages * There are typically fixed size MTRRs in there and overlapping @@ -30,4 +31,5 @@ void __init check_bugs(void) */ if (!direct_gbpages) set_memory_4k((unsigned long)__va(0), 1); +#endif } diff --git a/arch/x86/kernel/cpu/common-xen.c b/arch/x86/kernel/cpu/common-xen.c new file mode 100644 index 0000000..05e2af1 --- /dev/null +++ b/arch/x86/kernel/cpu/common-xen.c @@ -0,0 +1,1435 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_X86_LOCAL_APIC +#include +#endif + +#ifdef CONFIG_XEN +#include +#endif + +#include "cpu.h" + +/* all of these masks are initialized in setup_cpu_local_masks() */ +cpumask_var_t cpu_initialized_mask; +#ifndef CONFIG_XEN +cpumask_var_t cpu_callout_mask; +cpumask_var_t cpu_callin_mask; + +/* representing cpus for which sibling maps can be computed */ +cpumask_var_t cpu_sibling_setup_mask; +#endif + +/* correctly size the local cpu masks */ +void __init setup_cpu_local_masks(void) +{ + alloc_bootmem_cpumask_var(&cpu_initialized_mask); +#ifndef CONFIG_XEN + alloc_bootmem_cpumask_var(&cpu_callin_mask); + alloc_bootmem_cpumask_var(&cpu_callout_mask); + alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); +#endif +} + +static void __cpuinit default_init(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_64 + cpu_detect_cache_sizes(c); +#else + /* Not much we can do here... */ + /* Check if at least it has cpuid */ + if (c->cpuid_level == -1) { + /* No cpuid. 
It must be an ancient CPU */ + if (c->x86 == 4) + strcpy(c->x86_model_id, "486"); + else if (c->x86 == 3) + strcpy(c->x86_model_id, "386"); + } +#endif +} + +static const struct cpu_dev __cpuinitconst default_cpu = { + .c_init = default_init, + .c_vendor = "Unknown", + .c_x86_vendor = X86_VENDOR_UNKNOWN, +}; + +static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; + +DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { +#ifdef CONFIG_X86_64 + /* + * We need valid kernel segments for data and code in long mode too + * IRET will check the segment types kkeil 2000/10/28 + * Also sysret mandates a special GDT layout + * + * TLS descriptors are currently at a different place compared to i386. + * Hopefully nobody expects them at a fixed place (Wine?) + */ + [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff), +#else + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff), +#ifndef CONFIG_XEN + /* + * Segments used for calling PnP BIOS have byte granularity. + * They code segments and data segments have fixed 64k limits, + * the transfer segment sizes are set at run time. + */ + /* 32-bit code */ + [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), + /* 16-bit code */ + [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(0x0092, 0, 0xffff), + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(0x0092, 0, 0), + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(0x0092, 0, 0), + /* + * The APM segments have byte granularity and their bases + * are set at run time. All have 64k limits. 
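
/*
 * Editor's note, not part of the patch: the GDT_ENTRY_INIT(0xc09b, 0, 0xfffff)
 * style initialisers above pack a 16-bit flags value, a base and a 20-bit
 * limit into the 8-byte descriptor layout.  The sketch below rebuilds one
 * descriptor the same way and unpacks it again, so constants such as 0xc09b
 * (present DPL-0 code, 4 KiB granularity, 32-bit) are easier to read.  It is
 * an independent user-space illustration, not the kernel macro itself.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t gdt_entry(uint16_t flags, uint32_t base, uint32_t limit)
{
        uint64_t d = 0;

        d |= (uint64_t)(limit & 0xffff);                /* limit 15:0     */
        d |= (uint64_t)(base & 0xffffff) << 16;         /* base 23:0      */
        d |= (uint64_t)(flags & 0xff) << 40;            /* access byte    */
        d |= (uint64_t)((limit >> 16) & 0xf) << 48;     /* limit 19:16    */
        d |= (uint64_t)((flags >> 8) & 0xf0) << 48;     /* G, D/B, L, AVL */
        d |= (uint64_t)(base & 0xff000000) << 32;       /* base 31:24     */
        return d;
}

int main(void)
{
        uint64_t d = gdt_entry(0xc09b, 0, 0xfffff);

        printf("descriptor  = %#018llx\n", (unsigned long long)d);
        printf("access byte = %#04x\n", (unsigned)((d >> 40) & 0xff));
        printf("granularity = %u, 32-bit = %u, long = %u\n",
               (unsigned)((d >> 55) & 1), (unsigned)((d >> 54) & 1),
               (unsigned)((d >> 53) & 1));
        printf("limit       = %#x\n",
               (unsigned)((d & 0xffff) | (((d >> 48) & 0xf) << 16)));
        return 0;
}
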
+ */ + /* 32-bit code */ + [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), + /* 16-bit code */ + [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), + /* data */ + [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x4092, 0, 0xffff), + + [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), +#endif + [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), + GDT_STACK_CANARY_INIT +#endif +} }; +EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); + +static int __init x86_xsave_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_XSAVE); + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + return 1; +} +__setup("noxsave", x86_xsave_setup); + +static int __init x86_xsaveopt_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + return 1; +} +__setup("noxsaveopt", x86_xsaveopt_setup); + +#ifdef CONFIG_X86_32 +static int cachesize_override __cpuinitdata = -1; + +static int __init cachesize_setup(char *str) +{ + get_option(&str, &cachesize_override); + return 1; +} +__setup("cachesize=", cachesize_setup); + +static int __init x86_fxsr_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_FXSR); + setup_clear_cpu_cap(X86_FEATURE_XMM); + return 1; +} +__setup("nofxsr", x86_fxsr_setup); + +static int __init x86_sep_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_SEP); + return 1; +} +__setup("nosep", x86_sep_setup); +#endif + +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN) +/* Standard macro to see if a specific flag is changeable */ +static inline int flag_is_changeable_p(u32 flag) +{ + u32 f1, f2; + + /* + * Cyrix and IDT cpus allow disabling of CPUID + * so the code below may return different results + * when it is executed before and after enabling + * the CPUID. Add "volatile" to not allow gcc to + * optimize the subsequent calls to this function. 
+ */ + asm volatile ("pushfl \n\t" + "pushfl \n\t" + "popl %0 \n\t" + "movl %0, %1 \n\t" + "xorl %2, %0 \n\t" + "pushl %0 \n\t" + "popfl \n\t" + "pushfl \n\t" + "popl %0 \n\t" + "popfl \n\t" + + : "=&r" (f1), "=&r" (f2) + : "ir" (flag)); + + return ((f1^f2) & flag) != 0; +} + +/* Probe for the CPUID instruction */ +static int __cpuinit have_cpuid_p(void) +{ + return flag_is_changeable_p(X86_EFLAGS_ID); +} + +static int disable_x86_serial_nr __cpuinitdata = 1; + +static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) +{ + unsigned long lo, hi; + + if (!cpu_has(c, X86_FEATURE_PN) || !disable_x86_serial_nr) + return; + + /* Disable processor serial number: */ + + rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi); + lo |= 0x200000; + wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi); + + printk(KERN_NOTICE "CPU serial number disabled.\n"); + clear_cpu_cap(c, X86_FEATURE_PN); + + /* Disabling the serial number may affect the cpuid level */ + c->cpuid_level = cpuid_eax(0); +} + +static int __init x86_serial_nr_setup(char *s) +{ + disable_x86_serial_nr = 0; + return 1; +} +__setup("serialnumber", x86_serial_nr_setup); +#else +static inline int flag_is_changeable_p(u32 flag) +{ + return 1; +} +/* Probe for the CPUID instruction */ +static inline int have_cpuid_p(void) +{ + return 1; +} +static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) +{ +} +#endif + +static int disable_smep __cpuinitdata; +static __init int setup_disable_smep(char *arg) +{ + disable_smep = 1; + return 1; +} +__setup("nosmep", setup_disable_smep); + +static __cpuinit void setup_smep(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_SMEP)) { + if (unlikely(disable_smep)) { + setup_clear_cpu_cap(X86_FEATURE_SMEP); + clear_in_cr4(X86_CR4_SMEP); + } else + set_in_cr4(X86_CR4_SMEP); + } +} + +/* + * Some CPU features depend on higher CPUID levels, which may not always + * be available due to CPUID level capping or broken virtualization + * software. Add those features to this table to auto-disable them. + */ +struct cpuid_dependent_feature { + u32 feature; + u32 level; +}; + +static const struct cpuid_dependent_feature __cpuinitconst +cpuid_dependent_features[] = { + { X86_FEATURE_MWAIT, 0x00000005 }, + { X86_FEATURE_DCA, 0x00000009 }, + { X86_FEATURE_XSAVE, 0x0000000d }, + { 0, 0 } +}; + +static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) +{ + const struct cpuid_dependent_feature *df; + + for (df = cpuid_dependent_features; df->feature; df++) { + + if (!cpu_has(c, df->feature)) + continue; + /* + * Note: cpuid_level is set to -1 if unavailable, but + * extended_extended_level is set to 0 if unavailable + * and the legitimate extended levels are all negative + * when signed; hence the weird messing around with + * signs here... + */ + if (!((s32)df->level < 0 ? + (u32)df->level > (u32)c->extended_cpuid_level : + (s32)df->level > (s32)c->cpuid_level)) + continue; + + clear_cpu_cap(c, df->feature); + if (!warn) + continue; + + printk(KERN_WARNING + "CPU: CPU feature %s disabled, no CPUID level 0x%x\n", + x86_cap_flags[df->feature], df->level); + } +} + +/* + * Naming convention should be: [()] + * This table only is used unless init_() below doesn't set it; + * in particular, if CPUID levels 0x80000002..4 are supported, this + * isn't used + */ + +/* Look up CPU names by table lookup. 
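
/*
 * Editor's illustration, not part of the patch: the sign games in
 * filter_cpuid_features() above are easier to see in isolation.  Extended
 * CPUID levels (0x8000xxxx) are negative when viewed as s32, so they are
 * compared unsigned against extended_cpuid_level, while basic levels are
 * compared signed against cpuid_level (which is -1 when CPUID is absent).
 * The leaf/level values in main() are arbitrary samples.
 */
#include <stdio.h>
#include <stdint.h>

static int level_missing(uint32_t required, int32_t cpuid_level,
                         uint32_t extended_cpuid_level)
{
        if ((int32_t)required < 0)              /* extended leaf 0x8000xxxx */
                return required > extended_cpuid_level;
        return (int32_t)required > cpuid_level; /* basic leaf */
}

int main(void)
{
        /* CPU reporting basic leaves up to 0xb and extended up to 0x80000008 */
        int32_t cpuid_level = 0x0000000b;
        uint32_t ext_level = 0x80000008;

        printf("MWAIT (0x5)  missing: %d\n",
               level_missing(0x00000005, cpuid_level, ext_level));
        printf("XSAVE (0xd)  missing: %d\n",
               level_missing(0x0000000d, cpuid_level, ext_level));
        printf("0x8000000a   missing: %d\n",
               level_missing(0x8000000a, cpuid_level, ext_level));
        return 0;
}
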
*/ +static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c) +{ + const struct cpu_model_info *info; + + if (c->x86_model >= 16) + return NULL; /* Range check */ + + if (!this_cpu) + return NULL; + + info = this_cpu->c_models; + + while (info && info->family) { + if (info->family == c->x86) + return info->model_names[c->x86_model]; + info++; + } + return NULL; /* Not found */ +} + +__u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata; +__u32 cpu_caps_set[NCAPINTS] __cpuinitdata; + +void __ref load_percpu_segment(int cpu) +{ +#ifdef CONFIG_XEN_VCPU_INFO_PLACEMENT + static bool done; + + if (!done) { + done = true; + adjust_boot_vcpu_info(); + } +#endif +#ifdef CONFIG_X86_32 + loadsegment(fs, __KERNEL_PERCPU); +#else + loadsegment(gs, 0); +#ifndef CONFIG_XEN + wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu)); +#else + if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL, + (unsigned long)per_cpu(irq_stack_union.gs_base, cpu))) + BUG(); +#endif +#endif + load_stack_canary_segment(); +} + +/* + * Current gdt points %fs at the "master" per-cpu area: after this, + * it's on the real one. + */ +void switch_to_new_gdt(int cpu) +{ + struct desc_ptr gdt_descr; + unsigned long va, frames[16]; + int f; + + gdt_descr.address = (long)get_cpu_gdt_table(cpu); + gdt_descr.size = GDT_SIZE - 1; + + for (va = gdt_descr.address, f = 0; + va < gdt_descr.address + gdt_descr.size; + va += PAGE_SIZE, f++) { + frames[f] = arbitrary_virt_to_mfn(va); + make_page_readonly((void *)va, + XENFEAT_writable_descriptor_tables); + } + if (HYPERVISOR_set_gdt(frames, (gdt_descr.size + 1) / 8)) + BUG(); + + /* Reload the per-cpu base */ + + load_percpu_segment(cpu); +} + +static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {}; + +static void __cpuinit get_model_name(struct cpuinfo_x86 *c) +{ + unsigned int *v; + char *p, *q; + + if (c->extended_cpuid_level < 0x80000004) + return; + + v = (unsigned int *)c->x86_model_id; + cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); + cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); + cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); + c->x86_model_id[48] = 0; + + /* + * Intel chips right-justify this string for some dumb reason; + * undo that brain damage: + */ + p = q = &c->x86_model_id[0]; + while (*p == ' ') + p++; + if (p != q) { + while (*p) + *q++ = *p++; + while (q <= &c->x86_model_id[48]) + *q++ = '\0'; /* Zero-pad the rest */ + } +} + +void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c) +{ + unsigned int n, dummy, ebx, ecx, edx, l2size; + + n = c->extended_cpuid_level; + + if (n >= 0x80000005) { + cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); + c->x86_cache_size = (ecx>>24) + (edx>>24); +#ifdef CONFIG_X86_64 + /* On K8 L1 TLB is inclusive, so don't count it */ + c->x86_tlbsize = 0; +#endif + } + + if (n < 0x80000006) /* Some chips just has a large L1. */ + return; + + cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); + l2size = ecx >> 16; + +#ifdef CONFIG_X86_64 + c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); +#else + /* do processor-specific cache resizing */ + if (this_cpu->c_size_cache) + l2size = this_cpu->c_size_cache(c, l2size); + + /* Allow user to override all this if necessary. 
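
/*
 * Editor's illustration, not part of the patch: the in-place left-justify
 * done at the end of get_model_name() above, extracted into a user-space
 * helper.  The sample brand string is made up.
 */
#include <stdio.h>

static void left_justify(char model_id[49])
{
        char *p, *q;

        /* Skip the leading spaces the brand string is padded with ... */
        p = q = model_id;
        while (*p == ' ')
                p++;
        /* ... then shift the text down and zero-pad the tail. */
        if (p != q) {
                while (*p)
                        *q++ = *p++;
                while (q <= &model_id[48])
                        *q++ = '\0';
        }
}

int main(void)
{
        char model[49] = "      Intel(R) Xeon(R) CPU           X9999";  /* sample */

        left_justify(model);
        printf("'%s'\n", model);
        return 0;
}
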
*/ + if (cachesize_override != -1) + l2size = cachesize_override; + + if (l2size == 0) + return; /* Again, no L2 cache is possible */ +#endif + + c->x86_cache_size = l2size; +} + +void __cpuinit detect_ht(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_HT + u32 eax, ebx, ecx, edx; + int index_msb, core_bits; + static bool printed; + + if (!cpu_has(c, X86_FEATURE_HT)) + return; + + if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) + goto out; + + if (cpu_has(c, X86_FEATURE_XTOPOLOGY)) + return; + + cpuid(1, &eax, &ebx, &ecx, &edx); + + smp_num_siblings = (ebx & 0xff0000) >> 16; + + if (smp_num_siblings == 1) { + printk_once(KERN_INFO "CPU0: Hyper-Threading is disabled\n"); + goto out; + } + + if (smp_num_siblings <= 1) + goto out; + + index_msb = get_count_order(smp_num_siblings); + c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb); + + smp_num_siblings = smp_num_siblings / c->x86_max_cores; + + index_msb = get_count_order(smp_num_siblings); + + core_bits = get_count_order(c->x86_max_cores); + + c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) & + ((1 << core_bits) - 1); + +out: + if (!printed && (c->x86_max_cores * smp_num_siblings) > 1) { + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", + c->phys_proc_id); + printk(KERN_INFO "CPU: Processor Core ID: %d\n", + c->cpu_core_id); + printed = 1; + } +#endif +} + +static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) +{ + char *v = c->x86_vendor_id; + int i; + + for (i = 0; i < X86_VENDOR_NUM; i++) { + if (!cpu_devs[i]) + break; + + if (!strcmp(v, cpu_devs[i]->c_ident[0]) || + (cpu_devs[i]->c_ident[1] && + !strcmp(v, cpu_devs[i]->c_ident[1]))) { + + this_cpu = cpu_devs[i]; + c->x86_vendor = this_cpu->c_x86_vendor; + return; + } + } + + printk_once(KERN_ERR + "CPU: vendor_id '%s' unknown, using generic init.\n" \ + "CPU: Your system may be unstable.\n", v); + + c->x86_vendor = X86_VENDOR_UNKNOWN; + this_cpu = &default_cpu; +} + +void __cpuinit cpu_detect(struct cpuinfo_x86 *c) +{ + /* Get vendor name */ + cpuid(0x00000000, (unsigned int *)&c->cpuid_level, + (unsigned int *)&c->x86_vendor_id[0], + (unsigned int *)&c->x86_vendor_id[8], + (unsigned int *)&c->x86_vendor_id[4]); + + c->x86 = 4; + /* Intel-defined flags: level 0x00000001 */ + if (c->cpuid_level >= 0x00000001) { + u32 junk, tfms, cap0, misc; + + cpuid(0x00000001, &tfms, &misc, &junk, &cap0); + c->x86 = (tfms >> 8) & 0xf; + c->x86_model = (tfms >> 4) & 0xf; + c->x86_mask = tfms & 0xf; + + if (c->x86 == 0xf) + c->x86 += (tfms >> 20) & 0xff; + if (c->x86 >= 0x6) + c->x86_model += ((tfms >> 16) & 0xf) << 4; + + if (cap0 & (1<<19)) { + c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; + c->x86_cache_alignment = c->x86_clflush_size; + } + } +} + +void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) +{ + u32 tfms, xlvl; + u32 ebx; + + /* Intel-defined flags: level 0x00000001 */ + if (c->cpuid_level >= 0x00000001) { + u32 capability, excap; + + cpuid(0x00000001, &tfms, &ebx, &excap, &capability); + c->x86_capability[0] = capability; + c->x86_capability[4] = excap; + } + + /* Additional Intel-defined flags: level 0x00000007 */ + if (c->cpuid_level >= 0x00000007) { + u32 eax, ebx, ecx, edx; + + cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); + + c->x86_capability[9] = ebx; + } + + /* AMD-defined flags: level 0x80000001 */ + xlvl = cpuid_eax(0x80000000); + c->extended_cpuid_level = xlvl; + + if ((xlvl & 0xffff0000) == 0x80000000) { + if (xlvl >= 0x80000001) { + c->x86_capability[1] = cpuid_edx(0x80000001); + c->x86_capability[6] = cpuid_ecx(0x80000001); + } + } + + if 
(c->extended_cpuid_level >= 0x80000008) { + u32 eax = cpuid_eax(0x80000008); + + c->x86_virt_bits = (eax >> 8) & 0xff; + c->x86_phys_bits = eax & 0xff; + } +#ifdef CONFIG_X86_32 + else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36)) + c->x86_phys_bits = 36; +#endif + + if (c->extended_cpuid_level >= 0x80000007) + c->x86_power = cpuid_edx(0x80000007); + + init_scattered_cpuid_features(c); +} + +static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_32 + int i; + + /* + * First of all, decide if this is a 486 or higher + * It's a 486 if we can modify the AC flag + */ + if (flag_is_changeable_p(X86_EFLAGS_AC)) + c->x86 = 4; + else + c->x86 = 3; + + for (i = 0; i < X86_VENDOR_NUM; i++) + if (cpu_devs[i] && cpu_devs[i]->c_identify) { + c->x86_vendor_id[0] = 0; + cpu_devs[i]->c_identify(c); + if (c->x86_vendor_id[0]) { + get_cpu_vendor(c); + break; + } + } +#endif +} + +/* + * Do minimum CPU detection early. + * Fields really needed: vendor, cpuid_level, family, model, mask, + * cache alignment. + * The others are not touched to avoid unwanted side effects. + * + * WARNING: this function is only called on the BP. Don't add code here + * that is supposed to run on all CPUs. + */ +static void __init early_identify_cpu(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_64 + c->x86_clflush_size = 64; + c->x86_phys_bits = 36; + c->x86_virt_bits = 48; +#else + c->x86_clflush_size = 32; + c->x86_phys_bits = 32; + c->x86_virt_bits = 32; +#endif + c->x86_cache_alignment = c->x86_clflush_size; + + memset(&c->x86_capability, 0, sizeof c->x86_capability); + c->extended_cpuid_level = 0; + + if (!have_cpuid_p()) + identify_cpu_without_cpuid(c); + + /* cyrix could have cpuid enabled via c_identify()*/ + if (!have_cpuid_p()) + return; + + cpu_detect(c); + + get_cpu_vendor(c); + + get_cpu_cap(c); +#ifdef CONFIG_XEN + if (!cpu_has_xsave) + x86_xsave_setup(NULL); +#endif + + if (this_cpu->c_early_init) + this_cpu->c_early_init(c); + + c->cpu_index = 0; + filter_cpuid_features(c, false); + + setup_smep(c); + + if (this_cpu->c_bsp_init) + this_cpu->c_bsp_init(c); +} + +void __init early_cpu_init(void) +{ + const struct cpu_dev *const *cdev; + int count = 0; + +#ifdef CONFIG_PROCESSOR_SELECT + printk(KERN_INFO "KERNEL supported cpus:\n"); +#endif + + for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { + const struct cpu_dev *cpudev = *cdev; + + if (count >= X86_VENDOR_NUM) + break; + cpu_devs[count] = cpudev; + count++; + +#ifdef CONFIG_PROCESSOR_SELECT + { + unsigned int j; + + for (j = 0; j < 2; j++) { + if (!cpudev->c_ident[j]) + continue; + printk(KERN_INFO " %s %s\n", cpudev->c_vendor, + cpudev->c_ident[j]); + } + } +#endif + } + early_identify_cpu(&boot_cpu_data); +} + +/* + * The NOPL instruction is supposed to exist on all CPUs of family >= 6; + * unfortunately, that's not true in practice because of early VIA + * chips and (more importantly) broken virtualizers that are not easy + * to detect. In the latter case it doesn't even *fail* reliably, so + * probing for it doesn't even work. Disable it completely on 32-bit + * unless we can find a reliable way to detect all the broken cases. + * Enable it explicitly on 64-bit for non-constant inputs of cpu_has(). 
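
/*
 * Editor's illustration, not part of the patch: how cpu_detect() and
 * get_cpu_cap() above decode the raw CPUID words.  The two inputs are
 * hard-coded samples (a family-6 leaf 1 EAX and a leaf 0x80000008 EAX),
 * not read from a real CPU.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint32_t tfms = 0x000206c2;     /* sample CPUID.1:EAX */
        uint32_t addr = 0x00003028;     /* sample CPUID.0x80000008:EAX */
        unsigned family, model, stepping;

        family = (tfms >> 8) & 0xf;
        model = (tfms >> 4) & 0xf;
        stepping = tfms & 0xf;
        if (family == 0xf)
                family += (tfms >> 20) & 0xff;          /* extended family */
        if (family >= 0x6)
                model += ((tfms >> 16) & 0xf) << 4;     /* extended model */

        printf("family %#x, model %#x, stepping %u\n", family, model, stepping);
        printf("physical address bits: %u, virtual address bits: %u\n",
               addr & 0xff, (addr >> 8) & 0xff);
        return 0;
}
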
+ */ +static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_32 + clear_cpu_cap(c, X86_FEATURE_NOPL); +#else + set_cpu_cap(c, X86_FEATURE_NOPL); +#endif +} + +static void __cpuinit generic_identify(struct cpuinfo_x86 *c) +{ + c->extended_cpuid_level = 0; + + if (!have_cpuid_p()) + identify_cpu_without_cpuid(c); + + /* cyrix could have cpuid enabled via c_identify()*/ + if (!have_cpuid_p()) + return; + + cpu_detect(c); + + get_cpu_vendor(c); + + get_cpu_cap(c); + +#ifndef CONFIG_XEN + if (c->cpuid_level >= 0x00000001) { + c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; +#ifdef CONFIG_X86_32 +# ifdef CONFIG_X86_HT + c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); +# else + c->apicid = c->initial_apicid; +# endif +#endif + c->phys_proc_id = c->initial_apicid; + } +#endif + + setup_smep(c); + + get_model_name(c); /* Default name */ + + detect_nopl(c); +} + +/* + * This does the hard work of actually picking apart the CPU stuff... + */ +static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) +{ + int i; + + c->loops_per_jiffy = loops_per_jiffy; + c->x86_cache_size = -1; + c->x86_vendor = X86_VENDOR_UNKNOWN; + c->x86_model = c->x86_mask = 0; /* So far unknown... */ + c->x86_vendor_id[0] = '\0'; /* Unset */ + c->x86_model_id[0] = '\0'; /* Unset */ +#ifndef CONFIG_XEN + c->x86_max_cores = 1; + c->x86_coreid_bits = 0; +#endif +#ifdef CONFIG_X86_64 + c->x86_clflush_size = 64; + c->x86_phys_bits = 36; + c->x86_virt_bits = 48; +#else + c->cpuid_level = -1; /* CPUID not detected */ + c->x86_clflush_size = 32; + c->x86_phys_bits = 32; + c->x86_virt_bits = 32; +#endif + c->x86_cache_alignment = c->x86_clflush_size; + memset(&c->x86_capability, 0, sizeof c->x86_capability); + if (boot_cpu_has(X86_FEATURE_SYSCALL32)) + set_cpu_cap(c, X86_FEATURE_SYSCALL32); + + generic_identify(c); + + if (this_cpu->c_identify) + this_cpu->c_identify(c); + + /* Clear/Set all flags overriden by options, after probe */ + for (i = 0; i < NCAPINTS; i++) { + c->x86_capability[i] &= ~cpu_caps_cleared[i]; + c->x86_capability[i] |= cpu_caps_set[i]; + } + +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) + c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); +#endif + + /* + * Vendor-specific initialization. In this section we + * canonicalize the feature flags, meaning if there are + * features a certain CPU supports which CPUID doesn't + * tell us, CPUID claiming incorrect flags, or other bugs, + * we handle them here. + * + * At the end of this section, c->x86_capability better + * indicate the features this CPU genuinely supports! + */ + if (this_cpu->c_init) + this_cpu->c_init(c); + + /* Disable the PN if appropriate */ + squash_the_stupid_serial_number(c); + + /* + * The vendor-specific functions might have changed features. + * Now we do "generic changes." + */ + + /* Filter out anything that depends on CPUID levels we don't have */ + filter_cpuid_features(c, true); + + /* If the model name is still unset, do table lookup. */ + if (!c->x86_model_id[0]) { + const char *p; + p = table_lookup_model(c); + if (p) + strcpy(c->x86_model_id, p); + else + /* Last resort... */ + sprintf(c->x86_model_id, "%02x/%02x", + c->x86, c->x86_model); + } + +#ifdef CONFIG_X86_64 + detect_ht(c); +#endif + + init_hypervisor(c); + x86_init_rdrand(c); + + /* + * Clear/Set all flags overriden by options, need do it + * before following smp all cpus cap AND. 
+ */ + for (i = 0; i < NCAPINTS; i++) { + c->x86_capability[i] &= ~cpu_caps_cleared[i]; + c->x86_capability[i] |= cpu_caps_set[i]; + } + + /* + * On SMP, boot_cpu_data holds the common feature set between + * all CPUs; so make sure that we indicate which features are + * common between the CPUs. The first time this routine gets + * executed, c == &boot_cpu_data. + */ + if (c != &boot_cpu_data) { + /* AND the already accumulated flags with these */ + for (i = 0; i < NCAPINTS; i++) + boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; + } + + /* Init Machine Check Exception if available. */ + mcheck_cpu_init(c); + + select_idle_routine(c); + +#ifdef CONFIG_NUMA + numa_add_cpu(smp_processor_id()); +#endif +} + +#ifdef CONFIG_X86_64 +static void vgetcpu_set_mode(void) +{ + if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) + vgetcpu_mode = VGETCPU_RDTSCP; + else + vgetcpu_mode = VGETCPU_LSL; +} +#endif + +void __init identify_boot_cpu(void) +{ + identify_cpu(&boot_cpu_data); + init_amd_e400_c1e_mask(); +#ifdef CONFIG_X86_32 + sysenter_setup(); + enable_sep_cpu(); +#else + vgetcpu_set_mode(); +#endif +} + +#ifdef CONFIG_XEN +void set_perf_event_pending(void) {} +#endif + +void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) +{ + BUG_ON(c == &boot_cpu_data); + identify_cpu(c); +#ifdef CONFIG_X86_32 + enable_sep_cpu(); +#endif + mtrr_ap_init(); +} + +struct msr_range { + unsigned min; + unsigned max; +}; + +static const struct msr_range msr_range_array[] __cpuinitconst = { + { 0x00000000, 0x00000418}, + { 0xc0000000, 0xc000040b}, + { 0xc0010000, 0xc0010142}, + { 0xc0011000, 0xc001103b}, +}; + +static void __cpuinit print_cpu_msr(void) +{ + unsigned index_min, index_max; + unsigned index; + u64 val; + int i; + + for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) { + index_min = msr_range_array[i].min; + index_max = msr_range_array[i].max; + + for (index = index_min; index < index_max; index++) { + if (rdmsrl_amd_safe(index, &val)) + continue; + printk(KERN_INFO " MSR%08x: %016llx\n", index, val); + } + } +} + +static int show_msr __cpuinitdata; + +static __init int setup_show_msr(char *arg) +{ + int num; + + get_option(&arg, &num); + + if (num > 0) + show_msr = num; + return 1; +} +__setup("show_msr=", setup_show_msr); + +static __init int setup_noclflush(char *arg) +{ + setup_clear_cpu_cap(X86_FEATURE_CLFLSH); + return 1; +} +__setup("noclflush", setup_noclflush); + +void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) +{ + const char *vendor = NULL; + + if (c->x86_vendor < X86_VENDOR_NUM) { + vendor = this_cpu->c_vendor; + } else { + if (c->cpuid_level >= 0) + vendor = c->x86_vendor_id; + } + + if (vendor && !strstr(c->x86_model_id, vendor)) + printk(KERN_CONT "%s ", vendor); + + if (c->x86_model_id[0]) + printk(KERN_CONT "%s", c->x86_model_id); + else + printk(KERN_CONT "%d86", c->x86); + + if (c->x86_mask || c->cpuid_level >= 0) + printk(KERN_CONT " stepping %02x\n", c->x86_mask); + else + printk(KERN_CONT "\n"); + +#ifdef CONFIG_SMP + if (c->cpu_index < show_msr) + print_cpu_msr(); +#else + if (show_msr) + print_cpu_msr(); +#endif +} + +static __init int setup_disablecpuid(char *arg) +{ + int bit; + + if (get_option(&arg, &bit) && bit < NCAPINTS*32) + setup_clear_cpu_cap(bit); + else + return 0; + + return 1; +} +__setup("clearcpuid=", setup_disablecpuid); + +#ifdef CONFIG_X86_64 +#ifndef CONFIG_X86_NO_IDT +struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; +struct desc_ptr nmi_idt_descr = { NR_VECTORS * 16 - 1, + (unsigned long) nmi_idt_table }; +#endif 
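
/*
 * Editor's illustration, not part of the patch: the two capability fix-up
 * steps used in identify_cpu() above, i.e. re-applying cpu_caps_cleared /
 * cpu_caps_set after probing and AND-ing a secondary CPU's flags into the
 * boot CPU's common set.  A single 32-bit word stands in for the full
 * NCAPINTS array; the bit values are arbitrary.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint32_t boot_caps = 0xffffffff;        /* accumulated common set */
        uint32_t cpu_caps = 0x00ff00ff;         /* what this CPU probed */
        uint32_t caps_cleared = 0x00000001;     /* e.g. from "clearcpuid=" */
        uint32_t caps_set = 0x80000000;         /* forced-on bits */

        /* Re-apply command-line/quirk overrides after probing. */
        cpu_caps &= ~caps_cleared;
        cpu_caps |= caps_set;

        /* Keep only features that every CPU seen so far supports. */
        boot_caps &= cpu_caps;

        printf("cpu caps:    %#010x\n", (unsigned)cpu_caps);
        printf("common caps: %#010x\n", (unsigned)boot_caps);
        return 0;
}
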
+ +DEFINE_PER_CPU_FIRST(union irq_stack_union, + irq_stack_union) __aligned(PAGE_SIZE); + +void xen_switch_pt(void) +{ +#ifdef CONFIG_XEN + xen_pt_switch(init_level4_pgt); +#endif +} + +/* + * The following four percpu variables are hot. Align current_task to + * cacheline size such that all four fall in the same cacheline. + */ +DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = + &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); + +DEFINE_PER_CPU(unsigned long, kernel_stack) = + (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; +EXPORT_PER_CPU_SYMBOL(kernel_stack); + +DEFINE_PER_CPU(char *, irq_stack_ptr) = + init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; + +DEFINE_PER_CPU(unsigned int, irq_count) = -1; + +DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); +EXPORT_PER_CPU_SYMBOL(fpu_owner_task); + +#ifndef CONFIG_X86_NO_TSS +/* + * Special IST stacks which the CPU switches to when it calls + * an IST-marked descriptor entry. Up to 7 stacks (hardware + * limit), all of them are 4K, except the debug stack which + * is 8K. + */ +static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, + [DEBUG_STACK - 1] = DEBUG_STKSZ +}; + +static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks + [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); +#endif + +void __cpuinit syscall_init(void) +{ +#ifndef CONFIG_XEN + /* + * LSTAR and STAR live in a bit strange symbiosis. + * They both write to the same internal register. STAR allows to + * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. + */ + wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); + wrmsrl(MSR_LSTAR, system_call); + wrmsrl(MSR_CSTAR, ignore_sysret); +#endif + +#ifdef CONFIG_IA32_EMULATION + syscall32_cpu_init(); +#elif defined(CONFIG_XEN) + static const struct callback_register __cpuinitconst cstar = { + .type = CALLBACKTYPE_syscall32, + .address = (unsigned long)ignore_sysret + }; + + if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar)) + printk(KERN_WARNING "Unable to register CSTAR callback\n"); +#endif + +#ifndef CONFIG_XEN + /* Flags to clear on syscall */ + wrmsrl(MSR_SYSCALL_MASK, + X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL); +#endif +} + +unsigned long kernel_eflags; + +#ifndef CONFIG_X86_NO_TSS +/* + * Copies of the original ist values from the tss are only accessed during + * debugging, no special alignment required. 
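
/*
 * Editor's illustration, not part of the patch: the MSR_STAR value written in
 * syscall_init() above (native path only; the Xen build registers a syscall32
 * callback instead) packs the SYSRET base selector into bits 63:48 and the
 * SYSCALL CS/SS base selector into bits 47:32.  The selector values below
 * (0x10 and 0x23) are typical x86-64 GDT selectors, used here only as sample
 * inputs.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint16_t kernel_cs = 0x10;      /* sample __KERNEL_CS */
        uint16_t user32_cs = 0x23;      /* sample __USER32_CS */
        uint64_t star = ((uint64_t)user32_cs << 48) | ((uint64_t)kernel_cs << 32);

        printf("MSR_STAR             = %#018llx\n", (unsigned long long)star);
        printf("SYSCALL base selector = %#llx (SS is derived from it)\n",
               (unsigned long long)((star >> 32) & 0xffff));
        printf("SYSRET base selector  = %#llx\n",
               (unsigned long long)((star >> 48) & 0xffff));
        return 0;
}
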
+ */ +DEFINE_PER_CPU(struct orig_ist, orig_ist); +#endif + +#ifndef CONFIG_X86_NO_IDT +static DEFINE_PER_CPU(unsigned long, debug_stack_addr); +DEFINE_PER_CPU(int, debug_stack_usage); + +int is_debug_stack(unsigned long addr) +{ + return __get_cpu_var(debug_stack_usage) || + (addr <= __get_cpu_var(debug_stack_addr) && + addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ)); +} + +void debug_stack_set_zero(void) +{ + load_idt((const struct desc_ptr *)&nmi_idt_descr); +} + +void debug_stack_reset(void) +{ + load_idt((const struct desc_ptr *)&idt_descr); +} +#endif + +#else /* CONFIG_X86_64 */ + +DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); +DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); +EXPORT_PER_CPU_SYMBOL(fpu_owner_task); + +#ifdef CONFIG_CC_STACKPROTECTOR +DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); +#endif + +/* Make sure %fs and %gs are initialized properly in idle threads */ +struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) +{ + memset(regs, 0, sizeof(struct pt_regs)); + regs->fs = __KERNEL_PERCPU; + regs->gs = __KERNEL_STACK_CANARY; + + return regs; +} +#endif /* CONFIG_X86_64 */ + +/* + * Clear all 6 debug registers: + */ +static void clear_all_debug_regs(void) +{ + int i; + + for (i = 0; i < 8; i++) { + /* Ignore db4, db5 */ + if ((i == 4) || (i == 5)) + continue; + + set_debugreg(0, i); + } +} + +#ifdef CONFIG_KGDB +/* + * Restore debug regs if using kgdbwait and you have a kernel debugger + * connection established. + */ +static void dbg_restore_debug_regs(void) +{ + if (unlikely(kgdb_connected && arch_kgdb_ops.correct_hw_break)) + arch_kgdb_ops.correct_hw_break(); +} +#else /* ! CONFIG_KGDB */ +#define dbg_restore_debug_regs() +#endif /* ! CONFIG_KGDB */ + +#ifndef CONFIG_XEN +/* + * Prints an error where the NUMA and configured core-number mismatch and the + * platform didn't override this to fix it up + */ +void __cpuinit x86_default_fixup_cpu_id(struct cpuinfo_x86 *c, int node) +{ + pr_err("NUMA core number %d differs from configured core number %d\n", node, c->phys_proc_id); +} +#endif + +/* + * cpu_init() initializes state that is per-CPU. Some data is already + * initialized (naturally) in the bootstrap process, such as the GDT + * and IDT. We reload them nevertheless, this function acts as a + * 'CPU state barrier', nothing should get across. 
+ * A lot of state is already set up in PDA init for 64 bit + */ +#ifdef CONFIG_X86_64 + +void __cpuinit cpu_init(void) +{ +#ifndef CONFIG_X86_NO_TSS + struct orig_ist *oist; + struct tss_struct *t; + unsigned long v; + int i; +#endif + struct task_struct *me; + int cpu; + + cpu = stack_smp_processor_id(); + /* CPU 0 is initialised in head64.c */ + if (cpu != 0) + xen_switch_pt(); +#ifndef CONFIG_X86_NO_TSS + t = &per_cpu(init_tss, cpu); + oist = &per_cpu(orig_ist, cpu); +#endif + +#ifdef CONFIG_NUMA + if (cpu != 0 && percpu_read(numa_node) == 0 && + early_cpu_to_node(cpu) != NUMA_NO_NODE) + set_numa_node(early_cpu_to_node(cpu)); +#endif + + me = current; + + if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) + panic("CPU#%d already initialized!\n", cpu); + + pr_debug("Initializing CPU#%d\n", cpu); + + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + + /* + * Initialize the per-CPU GDT with the boot GDT, + * and set up the GDT descriptor: + */ + + switch_to_new_gdt(cpu); + loadsegment(fs, 0); + +#ifndef CONFIG_X86_NO_IDT + load_idt((const struct desc_ptr *)&idt_descr); +#endif + + memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); + syscall_init(); + + wrmsrl(MSR_FS_BASE, 0); + wrmsrl(MSR_KERNEL_GS_BASE, 0); + barrier(); + + x86_configure_nx(); +#ifdef CONFIG_X86_LOCAL_APIC + if (cpu != 0) + enable_x2apic(); +#endif + +#ifndef CONFIG_X86_NO_TSS + /* + * set up and load the per-CPU TSS + */ + if (!oist->ist[0]) { + char *estacks = per_cpu(exception_stacks, cpu); + + for (v = 0; v < N_EXCEPTION_STACKS; v++) { + estacks += exception_stack_sizes[v]; + oist->ist[v] = t->x86_tss.ist[v] = + (unsigned long)estacks; +#ifndef CONFIG_X86_NO_IDT + if (v == DEBUG_STACK-1) + per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks; +#endif + } + } + + t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); + + /* + * <= is required because the CPU will access up to + * 8 bits beyond the end of the IO permission bitmap. 
+ */ + for (i = 0; i <= IO_BITMAP_LONGS; i++) + t->io_bitmap[i] = ~0UL; +#endif + + atomic_inc(&init_mm.mm_count); + me->active_mm = &init_mm; + BUG_ON(me->mm); + enter_lazy_tlb(&init_mm, me); + + load_sp0(t, &current->thread); +#ifndef CONFIG_X86_NO_TSS + set_tss_desc(cpu, t); + load_TR_desc(); +#endif + load_LDT(&init_mm.context); + + clear_all_debug_regs(); + dbg_restore_debug_regs(); + + fpu_init(); + xsave_init(); + +#ifndef CONFIG_XEN + raw_local_save_flags(kernel_eflags); +#else + asm ("pushfq; popq %0" : "=rm" (kernel_eflags)); + if (raw_irqs_disabled()) + kernel_eflags &= ~X86_EFLAGS_IF; +#endif + +#ifdef CONFIG_X86_LOCAL_APIC + if (is_uv_system()) + uv_cpu_init(); +#endif +} + +#else + +void __cpuinit cpu_init(void) +{ + int cpu = smp_processor_id(); + struct task_struct *curr = current; +#ifndef CONFIG_X86_NO_TSS + struct tss_struct *t = &per_cpu(init_tss, cpu); +#endif + struct thread_struct *thread = &curr->thread; + + if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) { + printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); + for (;;) + local_irq_enable(); + } + + printk(KERN_INFO "Initializing CPU#%d\n", cpu); + + if (cpu_has_vme || cpu_has_de) + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + + switch_to_new_gdt(cpu); + + /* + * Set up and load the per-CPU TSS and LDT + */ + atomic_inc(&init_mm.mm_count); + curr->active_mm = &init_mm; + BUG_ON(curr->mm); + enter_lazy_tlb(&init_mm, curr); + + load_sp0(t, thread); + + load_LDT(&init_mm.context); + +#ifndef CONFIG_X86_NO_TSS + t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); +#endif + +#ifdef CONFIG_DOUBLEFAULT + /* Set up doublefault TSS pointer in the GDT */ + __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); +#endif + + clear_all_debug_regs(); + dbg_restore_debug_regs(); + + fpu_init(); + xsave_init(); +} +#endif diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 3e6ff6c..c55ce62 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -36,10 +36,15 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) { +#ifndef CONFIG_XEN misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID; wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); c->cpuid_level = cpuid_eax(0); get_cpu_cap(c); +#else + pr_warning("CPUID levels are restricted -" + " update hypervisor\n"); +#endif } } @@ -47,6 +52,7 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) (c->x86 == 0x6 && c->x86_model >= 0x0e)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); +#ifndef CONFIG_XEN if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) { unsigned lower_word; @@ -69,6 +75,7 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n"); clear_cpu_cap(c, X86_FEATURE_PSE); } +#endif #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSENTER32); @@ -93,8 +100,10 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); +#ifndef CONFIG_XEN if (!check_tsc_unstable()) sched_clock_stable = 1; +#endif } /* @@ -238,9 +247,13 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) rdmsr(MSR_IA32_MISC_ENABLE, lo, hi); if ((lo & MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE) == 0) { printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n"); +#ifndef CONFIG_XEN printk (KERN_INFO 
"CPU: Disabling hardware prefetching (Errata 037)\n"); lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE; wrmsr(MSR_IA32_MISC_ENABLE, lo, hi); +#else + pr_warning("CPU: Hypervisor update needed\n"); +#endif } } @@ -285,6 +298,7 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) } #endif +#ifndef CONFIG_XEN static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) { #ifdef CONFIG_NUMA @@ -357,6 +371,7 @@ static void __cpuinit detect_vmx_virtcap(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_VPID); } } +#endif static void __cpuinit init_intel(struct cpuinfo_x86 *c) { @@ -440,6 +455,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_P3); #endif +#ifndef CONFIG_XEN if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { /* * let's use the legacy cpuid vector 0x1 and 0x4 for topology @@ -456,6 +472,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_VMX)) detect_vmx_virtcap(c); +#endif /* * Initialize MSR_IA32_ENERGY_PERF_BIAS if BIOS did not. diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 6b45e5e..2c06ab4 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -279,8 +279,9 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, eax->split.type = types[leaf]; eax->split.level = levels[leaf]; eax->split.num_threads_sharing = 0; +#ifndef CONFIG_XEN eax->split.num_cores_on_die = __this_cpu_read(cpu_info.x86_max_cores) - 1; - +#endif if (assoc == 0xffff) eax->split.is_fully_associative = 1; @@ -298,7 +299,7 @@ struct _cache_attr { unsigned int); }; -#ifdef CONFIG_AMD_NB +#if defined(CONFIG_AMD_NB) && !defined(CONFIG_XEN) /* * L3 cache descriptors @@ -579,8 +580,8 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ - unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; #ifdef CONFIG_X86_HT + unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; unsigned int cpu = c->cpu_index; #endif @@ -614,16 +615,20 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) break; case 2: new_l2 = this_leaf.size/1024; +#ifdef CONFIG_X86_HT num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; index_msb = get_count_order(num_threads_sharing); l2_id = c->apicid >> index_msb; +#endif break; case 3: new_l3 = this_leaf.size/1024; +#ifdef CONFIG_X86_HT num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; index_msb = get_count_order( num_threads_sharing); l3_id = c->apicid >> index_msb; +#endif break; default: break; @@ -724,7 +729,7 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info); #define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y])) -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) { struct _cpuid4_info *this_leaf, *sibling_leaf; @@ -954,7 +959,7 @@ static struct attribute *default_attrs[] = { NULL }; -#ifdef CONFIG_AMD_NB +#if defined(CONFIG_AMD_NB) && !defined(CONFIG_XEN) static struct attribute ** __cpuinit amd_l3_attrs(void) { static struct attribute **attrs; @@ -1100,7 +1105,7 @@ static int __cpuinit cache_add_dev(struct device *dev) this_leaf = CPUID4_INFO_IDX(cpu, i); 
ktype_cache.default_attrs = default_attrs; -#ifdef CONFIG_AMD_NB +#if defined(CONFIG_AMD_NB) && !defined(CONFIG_XEN) if (this_leaf->base.nb) ktype_cache.default_attrs = amd_l3_attrs(); #endif diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index bb34b03..21e0a8a 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -3,6 +3,7 @@ obj-y = mce.o mce-severity.o obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o +obj-$(CONFIG_X86_XEN_MCE) += mce_dom0.o obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index fc4beb3..abdbadd 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -93,6 +93,7 @@ static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs) return NMI_HANDLED; } +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN) static void mce_irq_ipi(void *info) { int cpu = smp_processor_id(); @@ -104,6 +105,7 @@ static void mce_irq_ipi(void *info) raise_exception(m, NULL); } } +#endif /* Inject mce on current CPU */ static int raise_local(void) @@ -151,7 +153,7 @@ static void raise_mce(struct mce *m) if (context == MCJ_CTX_RANDOM) return; -#ifdef CONFIG_X86_LOCAL_APIC +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN) if (m->inject_flags & (MCJ_IRQ_BRAODCAST | MCJ_NMI_BROADCAST)) { unsigned long start; int cpu; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5a11ae2..4457a8f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -118,8 +118,10 @@ void mce_setup(struct mce *m) m->time = get_seconds(); m->cpuvendor = boot_cpu_data.x86_vendor; m->cpuid = cpuid_eax(1); +#ifndef CONFIG_XEN m->socketid = cpu_data(m->extcpu).phys_proc_id; m->apicid = cpu_data(m->extcpu).initial_apicid; +#endif rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); } @@ -266,9 +268,14 @@ static void print_mce(struct mce *m) * Note this output is parsed by external tools and old fields * should not be changed. */ +#ifndef CONFIG_XEN pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n", m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid, cpu_data(m->extcpu).microcode); +#else + pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", + m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); +#endif /* * Print out human-readable details about the MCE error, @@ -1153,8 +1160,15 @@ void mce_log_therm_throt_event(__u64 status) * Periodic polling timer for "silent" machine check errors. If the * poller finds an MCE, poll 2x faster. When the poller finds no more * errors, poll 2x slower (up to check_interval seconds). 
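
/*
 * Editor's illustration, not part of the patch: in the init_intel_cacheinfo()
 * hunks above, l2_id/l3_id are derived by shifting the APIC ID right by the
 * number of bits needed for the threads sharing that cache
 * (get_count_order()).  The APIC ID and sharing counts below are sample
 * values; get_count_order() is re-implemented as a plain ceil(log2()).
 */
#include <stdio.h>

static int get_count_order(unsigned int count)
{
        int order = 0;

        while ((1u << order) < count)
                order++;
        return order;
}

int main(void)
{
        unsigned int apicid = 0x15;     /* sample APIC ID */
        unsigned int l2_sharing = 2;    /* threads sharing the L2 */
        unsigned int l3_sharing = 16;   /* threads sharing the L3 */

        printf("l2_id = %u\n", apicid >> get_count_order(l2_sharing));
        printf("l3_id = %u\n", apicid >> get_count_order(l3_sharing));
        return 0;
}
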
+ * + * We will disable polling in DOM0 since all CMCI/Polling + * mechanism will be done in XEN for Intel CPUs */ +#if defined (CONFIG_X86_XEN_MCE) +static int check_interval = 0; /* disable polling */ +#else static int check_interval = 5 * 60; /* 5 minutes */ +#endif static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); @@ -1329,6 +1343,7 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD) { +#ifndef CONFIG_XEN if (c->x86 == 15 && banks > 4) { /* * disable GART TBL walk error reporting, which @@ -1337,6 +1352,7 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) */ clear_bit(10, (unsigned long *)&mce_banks[4].ctl); } +#endif if (c->x86 <= 17 && mce_bootlog < 0) { /* * Lots of broken BIOS around that don't clear them @@ -1409,6 +1425,7 @@ static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) { +#ifndef CONFIG_X86_64_XEN switch (c->x86_vendor) { case X86_VENDOR_INTEL: mce_intel_feature_init(c); @@ -1419,6 +1436,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) default: break; } +#endif } static void __mcheck_cpu_init_timer(void) @@ -2201,6 +2219,16 @@ static __init int mcheck_init_device(void) /* register character device /dev/mcelog */ misc_register(&mce_chrdev_device); +#ifdef CONFIG_X86_XEN_MCE + if (is_initial_xendomain()) { + /* Register vIRQ handler for MCE LOG processing */ + extern int bind_virq_for_mce(void); + + printk(KERN_DEBUG "MCE: bind virq for DOM0 logging\n"); + bind_virq_for_mce(); + } +#endif + return err; } device_initcall(mcheck_init_device); diff --git a/arch/x86/kernel/cpu/mcheck/mce_dom0.c b/arch/x86/kernel/cpu/mcheck/mce_dom0.c new file mode 100644 index 0000000..b6d5c3e --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce_dom0.c @@ -0,0 +1,185 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static xen_mc_logical_cpu_t *g_physinfo; +static unsigned int ncpus; + +static int convert_log(struct mc_info *mi) +{ + struct mcinfo_common *mic = NULL; + struct mcinfo_global *mc_global; + struct mcinfo_bank *mc_bank; + struct mce m; + unsigned int i; + bool found = false; + + x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL); + if (mic == NULL) + { + pr_err("DOM0_MCE_LOG: global data is NULL\n"); + return -1; + } + + mce_setup(&m); + mc_global = (struct mcinfo_global*)mic; + m.mcgstatus = mc_global->mc_gstatus; + m.apicid = mc_global->mc_apicid; + + for (i = 0; i < ncpus; i++) + if (g_physinfo[i].mc_apicid == m.apicid) { + found = true; + break; + } + WARN_ON_ONCE(!found); + m.socketid = mc_global->mc_socketid; + m.cpu = m.extcpu = g_physinfo[i].mc_cpunr; + m.cpuvendor = (__u8)g_physinfo[i].mc_vendor; + + x86_mcinfo_lookup(mic, mi, MC_TYPE_BANK); + do + { + if (mic == NULL || mic->size == 0) + break; + if (mic->type == MC_TYPE_BANK) + { + mc_bank = (struct mcinfo_bank*)mic; + m.misc = mc_bank->mc_misc; + m.status = mc_bank->mc_status; + m.addr = mc_bank->mc_addr; + m.tsc = mc_bank->mc_tsc; + m.bank = mc_bank->mc_bank; + printk(KERN_DEBUG "[CPU%d, BANK%d, addr %llx, state %llx]\n", + m.bank, m.cpu, m.addr, m.status); + /*log this record*/ + mce_log(&m); + } + mic = x86_mcinfo_next(mic); + }while (1); + + return 0; +} + +static struct mc_info *g_mi; + +/*dom0 mce virq handler, logging physical mce error info*/ + +static irqreturn_t 
mce_dom0_interrupt(int irq, void *dev_id) +{ + xen_mc_t mc_op; + int result = 0; + + printk(KERN_DEBUG "MCE_DOM0_LOG: enter dom0 mce vIRQ handler\n"); + mc_op.cmd = XEN_MC_fetch; + set_xen_guest_handle(mc_op.u.mc_fetch.data, g_mi); +urgent: + mc_op.u.mc_fetch.flags = XEN_MC_URGENT; + result = HYPERVISOR_mca(&mc_op); + if (result || mc_op.u.mc_fetch.flags & XEN_MC_NODATA || + mc_op.u.mc_fetch.flags & XEN_MC_FETCHFAILED) + { + printk(KERN_DEBUG "MCE_DOM0_LOG: No more urgent data\n"); + goto nonurgent; + } + else + { + result = convert_log(g_mi); + if (result) { + pr_err("MCE_DOM0_LOG: Log conversion failed\n"); + goto end; + } + /* After fetching the telem from DOM0, we need to dec the telem's + * refcnt and release the entry. The telem is reserved and inc + * refcnt when filling the telem. + */ + mc_op.u.mc_fetch.flags = XEN_MC_URGENT | XEN_MC_ACK; + result = HYPERVISOR_mca(&mc_op); + + goto urgent; + } +nonurgent: + mc_op.u.mc_fetch.flags = XEN_MC_NONURGENT; + result = HYPERVISOR_mca(&mc_op); + if (result || mc_op.u.mc_fetch.flags & XEN_MC_NODATA || + mc_op.u.mc_fetch.flags & XEN_MC_FETCHFAILED) + { + printk(KERN_DEBUG "MCE_DOM0_LOG: No more nonurgent data\n"); + goto end; + } + else + { + result = convert_log(g_mi); + if (result) { + pr_err("MCE_DOM0_LOG: Log conversion failed\n"); + goto end; + } + /* After fetching the telem from DOM0, we need to dec the telem's + * refcnt and release the entry. The telem is reserved and inc + * refcnt when filling the telem. + */ + mc_op.u.mc_fetch.flags = XEN_MC_NONURGENT | XEN_MC_ACK; + result = HYPERVISOR_mca(&mc_op); + + goto nonurgent; + } +end: + return IRQ_HANDLED; +} + +int __init bind_virq_for_mce(void) +{ + int ret; + xen_mc_t mc_op; + + g_mi = kmalloc(sizeof(*g_mi), GFP_KERNEL); + if (!g_mi) + return -ENOMEM; + + /* fetch physical CPU count */ + mc_op.cmd = XEN_MC_physcpuinfo; + set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, NULL); + ret = HYPERVISOR_mca(&mc_op); + if (ret) { + pr_err("MCE: Failed to get physical CPU count\n"); + kfree(g_mi); + return ret; + } + + /* fetch CPU physical info for later reference */ + ncpus = mc_op.u.mc_physcpuinfo.ncpus; + g_physinfo = kmalloc(sizeof(*g_physinfo) * ncpus, GFP_KERNEL); + if (!g_physinfo) { + kfree(g_mi); + return -ENOMEM; + } + set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo); + ret = HYPERVISOR_mca(&mc_op); + if (ret) { + pr_err("MCE: Failed to get physical CPUs' info\n"); + kfree(g_mi); + kfree(g_physinfo); + return ret; + } + + ret = bind_virq_to_irqhandler(VIRQ_MCA, 0, + mce_dom0_interrupt, 0, "mce", NULL); + + if (ret < 0) { + pr_err("MCE: Failed to bind vIRQ for Dom0\n"); + kfree(g_mi); + kfree(g_physinfo); + return ret; + } + + /* Log the machine checks left over from the previous reset. 
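The XEN_MC_fetch / XEN_MC_ACK sequence implemented above is, at its core, a per-severity drain loop: fetch a record, convert and log it, acknowledge it so the hypervisor can release the telemetry slot, and repeat until no data is left. A standalone sketch with stand-in helpers (not the real hypercall interface):

/* Standalone sketch of the drain loop above.  All helpers and data
 * here are made up for illustration. */
#include <stdio.h>
#include <stdbool.h>

enum severity { URGENT, NONURGENT };

static int pending[2] = { 2, 1 };       /* fake per-severity backlog */

static bool fetch_record(enum severity s)
{
        return pending[s] > 0;
}

static void convert_and_log(enum severity s)
{
        printf("logged %s record\n", s == URGENT ? "urgent" : "nonurgent");
}

static void ack_record(enum severity s)
{
        pending[s]--;           /* mirrors the XEN_MC_ACK step above */
}

static void drain(enum severity s)
{
        while (fetch_record(s)) {
                convert_and_log(s);
                ack_record(s);
        }
}

int main(void)
{
        drain(URGENT);          /* urgent records first, as above */
        drain(NONURGENT);
        return 0;
}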
*/ + mce_dom0_interrupt(VIRQ_MCA, NULL); + + return 0; +} + diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile index ad9e5ed..b854116 100644 --- a/arch/x86/kernel/cpu/mtrr/Makefile +++ b/arch/x86/kernel/cpu/mtrr/Makefile @@ -1,3 +1,4 @@ obj-y := main.o if.o generic.o cleanup.o obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o +obj-$(CONFIG_XEN) := main.o if.o diff --git a/arch/x86/kernel/cpu/mtrr/main-xen.c b/arch/x86/kernel/cpu/mtrr/main-xen.c new file mode 100644 index 0000000..013e120 --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/main-xen.c @@ -0,0 +1,326 @@ +#define DEBUG + +#include +#include +#include +#include + +#include +#include "mtrr.h" + +static DEFINE_MUTEX(mtrr_mutex); + +void generic_get_mtrr(unsigned int reg, unsigned long *base, + unsigned long *size, mtrr_type * type) +{ + struct xen_platform_op op; + + op.cmd = XENPF_read_memtype; + op.u.read_memtype.reg = reg; + if (unlikely(HYPERVISOR_platform_op(&op))) + memset(&op.u.read_memtype, 0, sizeof(op.u.read_memtype)); + + *size = op.u.read_memtype.nr_mfns; + *base = op.u.read_memtype.mfn; + *type = op.u.read_memtype.type; +} + +const struct mtrr_ops generic_mtrr_ops = { + .use_intel_if = 1, + .get = generic_get_mtrr, +}; + +const struct mtrr_ops *mtrr_if = &generic_mtrr_ops; +unsigned int num_var_ranges; +unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; + +static u64 tom2; + +static void __init set_num_var_ranges(void) +{ + struct xen_platform_op op; + + for (num_var_ranges = 0; ; num_var_ranges++) { + op.cmd = XENPF_read_memtype; + op.u.read_memtype.reg = num_var_ranges; + if (HYPERVISOR_platform_op(&op) != 0) + break; + } +} + +static void __init init_table(void) +{ + int i, max; + + max = num_var_ranges; + for (i = 0; i < max; i++) + mtrr_usage_table[i] = 0; +} + +int mtrr_add_page(unsigned long base, unsigned long size, + unsigned int type, bool increment) +{ + int error; + struct xen_platform_op op; + + mutex_lock(&mtrr_mutex); + + op.cmd = XENPF_add_memtype; + op.u.add_memtype.mfn = base; + op.u.add_memtype.nr_mfns = size; + op.u.add_memtype.type = type; + error = HYPERVISOR_platform_op(&op); + if (error) { + mutex_unlock(&mtrr_mutex); + BUG_ON(error > 0); + return error; + } + + if (increment) + ++mtrr_usage_table[op.u.add_memtype.reg]; + + mutex_unlock(&mtrr_mutex); + + return op.u.add_memtype.reg; +} + +static int mtrr_check(unsigned long base, unsigned long size) +{ + if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { + pr_warning("mtrr: size and base must be multiples of 4 kiB\n"); + pr_debug("mtrr: size: 0x%lx base: 0x%lx\n", size, base); + dump_stack(); + return -1; + } + return 0; +} + +int mtrr_add(unsigned long base, unsigned long size, unsigned int type, + bool increment) +{ + if (mtrr_check(base, size)) + return -EINVAL; + return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, + increment); +} +EXPORT_SYMBOL(mtrr_add); + +int mtrr_del_page(int reg, unsigned long base, unsigned long size) +{ + unsigned i; + mtrr_type ltype; + unsigned long lbase, lsize; + int error = -EINVAL; + struct xen_platform_op op; + + mutex_lock(&mtrr_mutex); + + if (reg < 0) { + /* Search for existing MTRR */ + for (i = 0; i < num_var_ranges; ++i) { + mtrr_if->get(i, &lbase, &lsize, <ype); + if (lbase == base && lsize == size) { + reg = i; + break; + } + } + if (reg < 0) { + pr_debug("mtrr: no MTRR for %lx000,%lx000 found\n", + base, size); + goto out; + } + } + if (mtrr_usage_table[reg] < 1) { + pr_warning("mtrr: reg: %d has count=0\n", reg); + goto out; + } + if 
(--mtrr_usage_table[reg] < 1) { + op.cmd = XENPF_del_memtype; + op.u.del_memtype.handle = 0; + op.u.del_memtype.reg = reg; + error = HYPERVISOR_platform_op(&op); + if (error) { + BUG_ON(error > 0); + goto out; + } + } + error = reg; + out: + mutex_unlock(&mtrr_mutex); + return error; +} + +int mtrr_del(int reg, unsigned long base, unsigned long size) +{ + if (mtrr_check(base, size)) + return -EINVAL; + return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); +} +EXPORT_SYMBOL(mtrr_del); + +/* + * Returns the effective MTRR type for the region + * Error returns: + * - 0xFE - when the range is "not entirely covered" by _any_ var range MTRR + * - 0xFF - when MTRR is not enabled + */ +u8 mtrr_type_lookup(u64 start, u64 end) +{ + int i, error; + u64 start_mfn, end_mfn, base_mfn, top_mfn; + u8 prev_match, curr_match; + struct xen_platform_op op; + + if (!is_initial_xendomain()) + return MTRR_TYPE_WRBACK; + + if (!num_var_ranges) + return 0xFF; + + start_mfn = start >> PAGE_SHIFT; + /* Make end inclusive end, instead of exclusive */ + end_mfn = --end >> PAGE_SHIFT; + + /* Look in fixed ranges. Just return the type as per start */ + if (start_mfn < 0x100) { +#if 0//todo + op.cmd = XENPF_read_memtype; + op.u.read_memtype.reg = ???; + error = HYPERVISOR_platform_op(&op); + if (!error) + return op.u.read_memtype.type; +#endif + return MTRR_TYPE_UNCACHABLE; + } + + /* + * Look in variable ranges + * Look of multiple ranges matching this address and pick type + * as per MTRR precedence + */ + prev_match = 0xFF; + for (i = 0; i < num_var_ranges; ++i) { + op.cmd = XENPF_read_memtype; + op.u.read_memtype.reg = i; + error = HYPERVISOR_platform_op(&op); + + if (error || !op.u.read_memtype.nr_mfns) + continue; + + base_mfn = op.u.read_memtype.mfn; + top_mfn = base_mfn + op.u.read_memtype.nr_mfns - 1; + + if (base_mfn > end_mfn || start_mfn > top_mfn) { + continue; + } + + if (base_mfn > start_mfn || end_mfn > top_mfn) { + return 0xFE; + } + + curr_match = op.u.read_memtype.type; + if (prev_match == 0xFF) { + prev_match = curr_match; + continue; + } + + if (prev_match == MTRR_TYPE_UNCACHABLE || + curr_match == MTRR_TYPE_UNCACHABLE) { + return MTRR_TYPE_UNCACHABLE; + } + + if ((prev_match == MTRR_TYPE_WRBACK && + curr_match == MTRR_TYPE_WRTHROUGH) || + (prev_match == MTRR_TYPE_WRTHROUGH && + curr_match == MTRR_TYPE_WRBACK)) { + prev_match = MTRR_TYPE_WRTHROUGH; + curr_match = MTRR_TYPE_WRTHROUGH; + } + + if (prev_match != curr_match) { + return MTRR_TYPE_UNCACHABLE; + } + } + + if (tom2) { + if (start >= (1ULL<<32) && (end < tom2)) + return MTRR_TYPE_WRBACK; + } + + if (prev_match != 0xFF) + return prev_match; + +#if 0//todo + op.cmd = XENPF_read_def_memtype; + error = HYPERVISOR_platform_op(&op); + if (!error) + return op.u.read_def_memtype.type; +#endif + return MTRR_TYPE_UNCACHABLE; +} + +/* + * Newer AMD K8s and later CPUs have a special magic MSR way to force WB + * for memory >4GB. Check for that here. + * Note this won't check if the MTRRs < 4GB where the magic bit doesn't + * apply to are wrong, but so far we don't know of any such case in the wild. 
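The precedence rules applied by mtrr_type_lookup() above when several variable ranges overlap (UC dominates, WB combined with WT degrades to WT, any other mismatch is treated as UC) condense into a single helper. The sketch below is standalone; the type values mirror the usual MTRR encodings but are defined locally for illustration:

/* Standalone sketch of the precedence rule used in mtrr_type_lookup(). */
#include <stdio.h>

#define MTRR_TYPE_UNCACHABLE 0
#define MTRR_TYPE_WRTHROUGH  4
#define MTRR_TYPE_WRBACK     6

static unsigned char combine_mtrr_types(unsigned char a, unsigned char b)
{
        if (a == b)
                return a;
        if (a == MTRR_TYPE_UNCACHABLE || b == MTRR_TYPE_UNCACHABLE)
                return MTRR_TYPE_UNCACHABLE;
        if ((a == MTRR_TYPE_WRBACK && b == MTRR_TYPE_WRTHROUGH) ||
            (a == MTRR_TYPE_WRTHROUGH && b == MTRR_TYPE_WRBACK))
                return MTRR_TYPE_WRTHROUGH;
        return MTRR_TYPE_UNCACHABLE;
}

int main(void)
{
        /* WB + WT -> WT (4); WB + UC -> UC (0) */
        printf("%d\n", combine_mtrr_types(MTRR_TYPE_WRBACK, MTRR_TYPE_WRTHROUGH));
        printf("%d\n", combine_mtrr_types(MTRR_TYPE_WRBACK, MTRR_TYPE_UNCACHABLE));
        return 0;
}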
+ */ +#define Tom2Enabled (1U << 21) +#define Tom2ForceMemTypeWB (1U << 22) + +static int __init _amd_special_default_mtrr(void) +{ + u32 l, h; + + if (!is_initial_xendomain()) + return 0; + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return 0; + if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) + return 0; + /* In case some hypervisor doesn't pass SYSCFG through */ + if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) + return 0; + /* + * Memory between 4GB and top of mem is forced WB by this magic bit. + * Reserved before K8RevF, but should be zero there. + */ + if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) == + (Tom2Enabled | Tom2ForceMemTypeWB)) + return 1; + return 0; +} + +void __init mtrr_bp_init(void) +{ + if (_amd_special_default_mtrr()) { + /* TOP_MEM2 */ + rdmsrl(MSR_K8_TOP_MEM2, tom2); + tom2 &= 0xffffff8000000ULL; + } +} + +void mtrr_ap_init(void) +{ +} + +static int __init mtrr_init(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (!is_initial_xendomain()) + return -ENODEV; + + if ((!cpu_has(c, X86_FEATURE_MTRR)) && + (!cpu_has(c, X86_FEATURE_K6_MTRR)) && + (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) && + (!cpu_has(c, X86_FEATURE_CENTAUR_MCR))) + return -ENODEV; + + set_num_var_ranges(); + init_table(); + + return 0; +} + +subsys_initcall(mtrr_init); diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 8022c66..845e3bd 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -10,7 +10,7 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c, unsigned int cpu) { -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) if (c->x86_max_cores * smp_num_siblings > 1) { seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); seq_printf(m, "siblings\t: %d\n", @@ -32,18 +32,22 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) */ int fpu_exception = c->hard_math && (ignore_fpu_irq || cpu_has_fpu); seq_printf(m, +#ifndef CONFIG_XEN "fdiv_bug\t: %s\n" "hlt_bug\t\t: %s\n" "f00f_bug\t: %s\n" "coma_bug\t: %s\n" +#endif "fpu\t\t: %s\n" "fpu_exception\t: %s\n" "cpuid level\t: %d\n" "wp\t\t: %s\n", +#ifndef CONFIG_XEN c->fdiv_bug ? "yes" : "no", c->hlt_works_ok ? "no" : "yes", c->f00f_bug ? "yes" : "no", c->coma_bug ? "yes" : "no", +#endif c->hard_math ? "yes" : "no", fpu_exception ? 
"yes" : "no", c->cpuid_level, @@ -83,8 +87,10 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "stepping\t: %d\n", c->x86_mask); else seq_printf(m, "stepping\t: unknown\n"); +#ifndef CONFIG_XEN if (c->microcode) seq_printf(m, "microcode\t: 0x%x\n", c->microcode); +#endif if (cpu_has(c, X86_FEATURE_TSC)) { unsigned int freq = cpufreq_quick_get(cpu); diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index addf9e8..a1c53de 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -41,6 +41,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 }, +#ifndef CONFIG_XEN { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, @@ -51,6 +52,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) { X86_FEATURE_DECODEASSISTS, CR_EDX, 7, 0x8000000a, 0 }, { X86_FEATURE_PAUSEFILTER, CR_EDX,10, 0x8000000a, 0 }, { X86_FEATURE_PFTHRESHOLD, CR_EDX,12, 0x8000000a, 0 }, +#endif { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 4397e98..dc581ec 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -28,7 +28,7 @@ */ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) { -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) unsigned int eax, ebx, ecx, edx, sub_index; unsigned int ht_mask_width, core_plus_mask_width; unsigned int core_select_mask, core_level_siblings; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index caf5de0..d9b4f99 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -21,6 +21,7 @@ #define N_EXCEPTION_STACKS_END \ (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2) +#ifndef CONFIG_X86_NO_TSS static char x86_stack_ids[][8] = { [ DEBUG_STACK-1 ] = "#DB", [ NMI_STACK-1 ] = "NMI", @@ -32,10 +33,12 @@ static char x86_stack_ids[][8] = { N_EXCEPTION_STACKS_END ] = "#DB[?]" #endif }; +#endif static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, unsigned *usedp, char **idp) { +#ifndef CONFIG_X86_NO_TSS unsigned k; /* @@ -95,6 +98,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, } #endif } +#endif /* CONFIG_X86_NO_TSS */ return NULL; } diff --git a/arch/x86/kernel/e820-xen.c b/arch/x86/kernel/e820-xen.c new file mode 100644 index 0000000..481afb7 --- /dev/null +++ b/arch/x86/kernel/e820-xen.c @@ -0,0 +1,1291 @@ +/* + * Handle the memory map. + * The functions here do the job until bootmem takes over. + * + * Getting sanitize_e820_map() in sync with i386 version by applying change: + * - Provisions for empty E820 memory regions (reported by certain BIOSes). + * Alex Achenbach , December 2002. + * Venkatesh Pallipadi + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* + * The e820 map is the map that gets modified e.g. with command line parameters + * and that is also registered with modifications in the kernel resource tree + * with the iomem_resource as parent. + * + * The e820_saved is directly saved after the BIOS-provided memory map is + * copied. It doesn't get modified afterwards. 
It's registered for the + * /sys/firmware/memmap interface. + * + * That memory map is not modified and is used as base for kexec. The kexec'd + * kernel should get the same memory map as the firmware provides. Then the + * user can e.g. boot the original kernel with mem=1G while still booting the + * next kernel with full memory. + */ +struct e820map e820; +#if !defined(CONFIG_XEN) +struct e820map e820_saved; +#elif defined(CONFIG_XEN_PRIVILEGED_GUEST) +struct e820map machine_e820; +# define e820_saved machine_e820 +#else +# define machine_e820 e820 +# define e820_saved e820 +#endif + +/* For PCI or other memory-mapped resources */ +unsigned long pci_mem_start = 0xaeedbabe; +#ifdef CONFIG_PCI +EXPORT_SYMBOL(pci_mem_start); +#endif + +/* + * This function checks if any part of the range is mapped + * with type. + */ +int +e820_any_mapped(u64 start, u64 end, unsigned type) +{ + int i; + +#ifndef CONFIG_XEN + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; +#else + if (!is_initial_xendomain()) + return 0; + for (i = 0; i < machine_e820.nr_map; ++i) { + const struct e820entry *ei = &machine_e820.map[i]; +#endif + + if (type && ei->type != type) + continue; + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(e820_any_mapped); + +/* + * This function checks if the entire range is mapped with type. + * + * Note: this function only works correct if the e820 table is sorted and + * not-overlapping, which is the case + */ +int __init e820_all_mapped(u64 start, u64 end, unsigned type) +{ + int i; + +#ifndef CONFIG_XEN + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; +#else + if (!is_initial_xendomain()) + return 0; + for (i = 0; i < machine_e820.nr_map; ++i) { + const struct e820entry *ei = &machine_e820.map[i]; +#endif + + if (type && ei->type != type) + continue; + /* is the region (part) in overlap with the current region ?*/ + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + + /* if the region is at the beginning of we move + * start to the end of the region since it's ok until there + */ + if (ei->addr <= start) + start = ei->addr + ei->size; + /* + * if start is now at or beyond end, we're done, full + * coverage + */ + if (start >= end) + return 1; + } + return 0; +} + +/* + * Add a memory region to the kernel e820 map. + */ +static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, + int type) +{ + int x = e820x->nr_map; + + if (x >= ARRAY_SIZE(e820x->map)) { + printk(KERN_ERR "Ooops! 
Too many entries in the memory map!\n"); + return; + } + + e820x->map[x].addr = start; + e820x->map[x].size = size; + e820x->map[x].type = type; + e820x->nr_map++; +} + +void __init e820_add_region(u64 start, u64 size, int type) +{ + __e820_add_region(&e820, start, size, type); +} + +static void __init e820_print_type(u32 type) +{ + switch (type) { + case E820_RAM: + case E820_RESERVED_KERN: + printk(KERN_CONT "(usable)"); + break; + case E820_RESERVED: + printk(KERN_CONT "(reserved)"); + break; + case E820_ACPI: + printk(KERN_CONT "(ACPI data)"); + break; + case E820_NVS: + printk(KERN_CONT "(ACPI NVS)"); + break; + case E820_UNUSABLE: + printk(KERN_CONT "(unusable)"); + break; + default: + printk(KERN_CONT "type %u", type); + break; + } +} + +static void __init _e820_print_map(const struct e820map *e820, const char *who) +{ + int i; + + for (i = 0; i < e820->nr_map; i++) { + printk(KERN_INFO " %s: %016Lx - %016Lx ", who, + (unsigned long long) e820->map[i].addr, + (unsigned long long) + (e820->map[i].addr + e820->map[i].size)); + e820_print_type(e820->map[i].type); + printk(KERN_CONT "\n"); + } +} + +/* + * Sanitize the BIOS e820 map. + * + * Some e820 responses include overlapping entries. The following + * replaces the original e820 map with a new one, removing overlaps, + * and resolving conflicting memory types in favor of highest + * numbered type. + * + * The input parameter biosmap points to an array of 'struct + * e820entry' which on entry has elements in the range [0, *pnr_map) + * valid, and which has space for up to max_nr_map entries. + * On return, the resulting sanitized e820 map entries will be in + * overwritten in the same location, starting at biosmap. + * + * The integer pointed to by pnr_map must be valid on entry (the + * current number of valid entries located at biosmap) and will + * be updated on return, with the new number of valid entries + * (something no more than max_nr_map.) + * + * The return value from sanitize_e820_map() is zero if it + * successfully 'sanitized' the map entries passed in, and is -1 + * if it did nothing, which can happen if either of (1) it was + * only passed one map entry, or (2) any of the input map entries + * were invalid (start + size < start, meaning that the size was + * so big the described memory range wrapped around through zero.) + * + * Visually we're performing the following + * (1,2,3,4 = memory types)... 
+ * + * Sample memory map (w/overlaps): + * ____22__________________ + * ______________________4_ + * ____1111________________ + * _44_____________________ + * 11111111________________ + * ____________________33__ + * ___________44___________ + * __________33333_________ + * ______________22________ + * ___________________2222_ + * _________111111111______ + * _____________________11_ + * _________________4______ + * + * Sanitized equivalent (no overlap): + * 1_______________________ + * _44_____________________ + * ___1____________________ + * ____22__________________ + * ______11________________ + * _________1______________ + * __________3_____________ + * ___________44___________ + * _____________33_________ + * _______________2________ + * ________________1_______ + * _________________4______ + * ___________________2____ + * ____________________33__ + * ______________________4_ + */ +struct change_member { + struct e820entry *pbios; /* pointer to original bios entry */ + unsigned long long addr; /* address for this change point */ +}; + +static int __init cpcompare(const void *a, const void *b) +{ + struct change_member * const *app = a, * const *bpp = b; + const struct change_member *ap = *app, *bp = *bpp; + + /* + * Inputs are pointers to two elements of change_point[]. If their + * addresses are unequal, their difference dominates. If the addresses + * are equal, then consider one that represents the end of its region + * to be greater than one that does not. + */ + if (ap->addr != bp->addr) + return ap->addr > bp->addr ? 1 : -1; + + return (ap->addr != ap->pbios->addr) - (bp->addr != bp->pbios->addr); +} + +int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, + u32 *pnr_map) +{ + static struct change_member change_point_list[2*E820_X_MAX] __initdata; + static struct change_member *change_point[2*E820_X_MAX] __initdata; + static struct e820entry *overlap_list[E820_X_MAX] __initdata; + static struct e820entry new_bios[E820_X_MAX] __initdata; + unsigned long current_type, last_type; + unsigned long long last_addr; + int chgidx; + int overlap_entries; + int new_bios_entry; + int old_nr, new_nr, chg_nr; + int i; + + /* if there's only one memory region, don't bother */ +#ifdef CONFIG_XEN + if (*pnr_map == 1) + return 0; +#endif + if (*pnr_map < 2) + return -1; + + old_nr = *pnr_map; + BUG_ON(old_nr > max_nr_map); + + /* bail out if we find any unreasonable addresses in bios map */ + for (i = 0; i < old_nr; i++) + if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) + return -1; + + /* create pointers for initial change-point information (for sorting) */ + for (i = 0; i < 2 * old_nr; i++) + change_point[i] = &change_point_list[i]; + + /* record all known change-points (starting and ending addresses), + omitting those that are for empty memory regions */ + chgidx = 0; + for (i = 0; i < old_nr; i++) { + if (biosmap[i].size != 0) { + change_point[chgidx]->addr = biosmap[i].addr; + change_point[chgidx++]->pbios = &biosmap[i]; + change_point[chgidx]->addr = biosmap[i].addr + + biosmap[i].size; + change_point[chgidx++]->pbios = &biosmap[i]; + } + } + chg_nr = chgidx; + + /* sort change-point list by memory addresses (low -> high) */ + sort(change_point, chg_nr, sizeof *change_point, cpcompare, NULL); + + /* create a new bios memory map, removing overlaps */ + overlap_entries = 0; /* number of entries in the overlap table */ + new_bios_entry = 0; /* index for creating new bios map entries */ + last_type = 0; /* start with undefined memory type */ + last_addr = 0; /* start 
with 0 as last starting address */ + + /* loop through change-points, determining affect on the new bios map */ + for (chgidx = 0; chgidx < chg_nr; chgidx++) { + /* keep track of all overlapping bios entries */ + if (change_point[chgidx]->addr == + change_point[chgidx]->pbios->addr) { + /* + * add map entry to overlap list (> 1 entry + * implies an overlap) + */ + overlap_list[overlap_entries++] = + change_point[chgidx]->pbios; + } else { + /* + * remove entry from list (order independent, + * so swap with last) + */ + for (i = 0; i < overlap_entries; i++) { + if (overlap_list[i] == + change_point[chgidx]->pbios) + overlap_list[i] = + overlap_list[overlap_entries-1]; + } + overlap_entries--; + } + /* + * if there are overlapping entries, decide which + * "type" to use (larger value takes precedence -- + * 1=usable, 2,3,4,4+=unusable) + */ + current_type = 0; + for (i = 0; i < overlap_entries; i++) + if (overlap_list[i]->type > current_type) + current_type = overlap_list[i]->type; + /* + * continue building up new bios map based on this + * information + */ + if (current_type != last_type) { + if (last_type != 0) { + new_bios[new_bios_entry].size = + change_point[chgidx]->addr - last_addr; + /* + * move forward only if the new size + * was non-zero + */ + if (new_bios[new_bios_entry].size != 0) + /* + * no more space left for new + * bios entries ? + */ + if (++new_bios_entry >= max_nr_map) + break; + } + if (current_type != 0) { + new_bios[new_bios_entry].addr = + change_point[chgidx]->addr; + new_bios[new_bios_entry].type = current_type; + last_addr = change_point[chgidx]->addr; + } + last_type = current_type; + } + } + /* retain count for new bios entries */ + new_nr = new_bios_entry; + + /* copy new bios mapping into original location */ + memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry)); + *pnr_map = new_nr; + + return 0; +} + +static int __init __append_e820_map(struct e820entry *biosmap, int nr_map) +{ + while (nr_map) { + u64 start = biosmap->addr; + u64 size = biosmap->size; + u64 end = start + size; + u32 type = biosmap->type; + + /* Overflow in 64 bits? Ignore the memory map. */ + if (start > end) + return -1; + + e820_add_region(start, size, type); + + biosmap++; + nr_map--; + } + return 0; +} + +/* + * Copy the BIOS e820 map into a safe place. + * + * Sanity-check it while we're at it.. + * + * If we're lucky and live on a modern system, the setup code + * will have given us a memory map that we can use to properly + * set up memory. If we aren't, we'll fake a memory map. + */ +static int __init append_e820_map(struct e820entry *biosmap, int nr_map) +{ +#ifndef CONFIG_XEN + /* Only one memory region (or negative)? 
Ignore it */ + if (nr_map < 2) + return -1; +#else + BUG_ON(nr_map < 1); +#endif + + return __append_e820_map(biosmap, nr_map); +} + +static u64 __init __e820_update_range(struct e820map *e820x, u64 start, + u64 size, unsigned old_type, + unsigned new_type) +{ + u64 end; + unsigned int i; + u64 real_updated_size = 0; + + BUG_ON(old_type == new_type); + + if (size > (ULLONG_MAX - start)) + size = ULLONG_MAX - start; + + end = start + size; + printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ", + (unsigned long long) start, + (unsigned long long) end); + e820_print_type(old_type); + printk(KERN_CONT " ==> "); + e820_print_type(new_type); + printk(KERN_CONT "\n"); + + for (i = 0; i < e820x->nr_map; i++) { + struct e820entry *ei = &e820x->map[i]; + u64 final_start, final_end; + u64 ei_end; + + if (ei->type != old_type) + continue; + + ei_end = ei->addr + ei->size; + /* totally covered by new range? */ + if (ei->addr >= start && ei_end <= end) { + ei->type = new_type; + real_updated_size += ei->size; + continue; + } + + /* new range is totally covered? */ + if (ei->addr < start && ei_end > end) { + __e820_add_region(e820x, start, size, new_type); + __e820_add_region(e820x, end, ei_end - end, ei->type); + ei->size = start - ei->addr; + real_updated_size += size; + continue; + } + + /* partially covered */ + final_start = max(start, ei->addr); + final_end = min(end, ei_end); + if (final_start >= final_end) + continue; + + __e820_add_region(e820x, final_start, final_end - final_start, + new_type); + + real_updated_size += final_end - final_start; + + /* + * left range could be head or tail, so need to update + * size at first. + */ + ei->size -= final_end - final_start; + if (ei->addr < final_start) + continue; + ei->addr = final_end; + } + return real_updated_size; +} + +u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, + unsigned new_type) +{ + return __e820_update_range(&e820, start, size, old_type, new_type); +} + +#ifndef CONFIG_XEN_UNPRIVILEGED_GUEST +static u64 __init e820_update_range_saved(u64 start, u64 size, + unsigned old_type, unsigned new_type) +{ +#ifdef CONFIG_XEN + if (!is_initial_xendomain()) + return 0; + return __e820_update_range(&machine_e820, phys_to_machine(start), + size, old_type, new_type); +#else + return __e820_update_range(&e820_saved, start, size, old_type, + new_type); +#endif +} +#endif + +/* make e820 not cover the range */ +u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, + int checktype) +{ + int i; + u64 end; + u64 real_removed_size = 0; + + if (size > (ULLONG_MAX - start)) + size = ULLONG_MAX - start; + + end = start + size; + printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ", + (unsigned long long) start, + (unsigned long long) end); + if (checktype) + e820_print_type(old_type); + printk(KERN_CONT "\n"); + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + u64 final_start, final_end; + u64 ei_end; + + if (checktype && ei->type != old_type) + continue; + + ei_end = ei->addr + ei->size; + /* totally covered? */ + if (ei->addr >= start && ei_end <= end) { + real_removed_size += ei->size; + memset(ei, 0, sizeof(struct e820entry)); + continue; + } + + /* new range is totally covered? 
*/ + if (ei->addr < start && ei_end > end) { + e820_add_region(end, ei_end - end, ei->type); + ei->size = start - ei->addr; + real_removed_size += size; + continue; + } + + /* partially covered */ + final_start = max(start, ei->addr); + final_end = min(end, ei_end); + if (final_start >= final_end) + continue; + real_removed_size += final_end - final_start; + + /* + * left range could be head or tail, so need to update + * size at first. + */ + ei->size -= final_end - final_start; + if (ei->addr < final_start) + continue; + ei->addr = final_end; + } + return real_removed_size; +} + +void __init update_e820(void) +{ + u32 nr_map; + + nr_map = e820.nr_map; + if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map)) + return; + e820.nr_map = nr_map; + printk(KERN_INFO "modified physical RAM map:\n"); + _e820_print_map(&e820, "modified"); +} +#ifndef CONFIG_XEN_UNPRIVILEGED_GUEST +static void __init update_e820_saved(void) +{ + u32 nr_map; + + nr_map = e820_saved.nr_map; + if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map)) + return; + e820_saved.nr_map = nr_map; +} +#endif + +#ifdef CONFIG_XEN +#define e820 machine_e820 +#endif + +#define MAX_GAP_END 0x100000000ull +/* + * Search for a gap in the e820 memory space from start_addr to end_addr. + */ +__init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, + unsigned long start_addr, unsigned long long end_addr) +{ + unsigned long long last; + int i = e820.nr_map; + int found = 0; + + last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END; +#ifdef CONFIG_X86_64 + if (start_addr >= MAX_GAP_END) + last = end_addr ?: (1UL << boot_cpu_data.x86_phys_bits); +#endif + + while (--i >= 0) { + unsigned long long start = e820.map[i].addr; + unsigned long long end = start + e820.map[i].size; + + if (end < start_addr) + continue; + + /* + * Since "last" is at most 4GB, we know we'll + * fit in 32 bits if this condition is true + */ + if (last > end) { + unsigned long gap = last - end; + + if (gap >= *gapsize) { + *gapsize = gap; + *gapstart = end; + found = 1; + } + } + if (start < last) + last = start; + } + return found; +} + +/* + * Search for the biggest gap in the low 32 bits of the e820 + * memory space. We pass this space to PCI to assign MMIO resources + * for hotplug or unconfigured devices in. + * Hopefully the BIOS let enough space left. + */ +__init void e820_setup_gap(void) +{ + unsigned long gapstart, gapsize; + int found; + + gapstart = 0x10000000; + gapsize = 0x400000; + found = e820_search_gap(&gapstart, &gapsize, 0, MAX_GAP_END); + +#ifdef CONFIG_X86_64 + if (!found) { + printk(KERN_ERR + "PCI: Warning: Cannot find a gap in the 32bit address range\n" + "PCI: Unassigned devices with 32bit resource registers may break!\n"); + found = e820_search_gap(&gapstart, &gapsize, MAX_GAP_END, 0); + WARN_ON(!found); + } +#endif + + /* + * e820_reserve_resources_late protect stolen RAM already + */ + pci_mem_start = gapstart; + + printk(KERN_INFO + "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", + pci_mem_start, gapstart, gapsize); +} + +#undef e820 + +#ifndef CONFIG_XEN +/** + * Because of the size limitation of struct boot_params, only first + * 128 E820 memory entries are passed to kernel via + * boot_params.e820_map, others are passed via SETUP_E820_EXT node of + * linked list of struct setup_data, which is parsed here. 
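e820_search_gap() above is essentially a backwards sweep that tracks the lowest region start seen so far and records the largest hole below the limit. A simplified standalone version (made-up map data, assuming a sorted, non-overlapping table):

/* Simplified, standalone version of the gap search above: given a
 * sorted, non-overlapping set of regions, find the largest hole below
 * a 4 GiB limit.  Types and sample data are made up for illustration. */
#include <stdio.h>

struct region { unsigned long long start, size; };

static int find_gap(const struct region *map, int n,
                    unsigned long long limit,
                    unsigned long long *gap_start,
                    unsigned long long *gap_size)
{
        unsigned long long last = limit;
        int i, found = 0;

        for (i = n - 1; i >= 0; i--) {
                unsigned long long end = map[i].start + map[i].size;

                if (last > end && last - end > *gap_size) {
                        *gap_size = last - end;
                        *gap_start = end;
                        found = 1;
                }
                if (map[i].start < last)
                        last = map[i].start;
        }
        return found;
}

int main(void)
{
        const struct region map[] = {
                { 0x00000000ULL, 0xa0000ULL },          /* low RAM */
                { 0x00100000ULL, 0x7ff00000ULL },       /* RAM up to 2 GiB */
                { 0xfec00000ULL, 0x1400000ULL },        /* chipset/APIC space */
        };
        unsigned long long start = 0, size = 0x400000;  /* 4 MiB minimum, as above */

        if (find_gap(map, 3, 0x100000000ULL, &start, &size))
                printf("gap at %#llx, size %#llx\n", start, size);
        return 0;
}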
+ */ +void __init parse_e820_ext(struct setup_data *sdata) +{ + int entries; + struct e820entry *extmap; + + entries = sdata->len / sizeof(struct e820entry); + extmap = (struct e820entry *)(sdata->data); + __append_e820_map(extmap, entries); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + printk(KERN_INFO "extended physical RAM map:\n"); + _e820_print_map(&e820, "extended"); +} + +#if defined(CONFIG_X86_64) || \ + (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION)) +/** + * Find the ranges of physical addresses that do not correspond to + * e820 RAM areas and mark the corresponding pages as nosave for + * hibernation (32 bit) or software suspend and suspend to RAM (64 bit). + * + * This function requires the e820 map to be sorted and without any + * overlapping entries and assumes the first e820 area to be RAM. + */ +void __init e820_mark_nosave_regions(unsigned long limit_pfn) +{ + int i; + unsigned long pfn; + + pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size); + for (i = 1; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (pfn < PFN_UP(ei->addr)) + register_nosave_region(pfn, PFN_UP(ei->addr)); + + pfn = PFN_DOWN(ei->addr + ei->size); + if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN) + register_nosave_region(PFN_UP(ei->addr), pfn); + + if (pfn >= limit_pfn) + break; + } +} +#endif + +#ifdef CONFIG_ACPI +/** + * Mark ACPI NVS memory region, so that we can save/restore it during + * hibernation and the subsequent resume. + */ +static int __init e820_mark_nvs_memory(void) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (ei->type == E820_NVS) + acpi_nvs_register(ei->addr, ei->size); + } + + return 0; +} +core_initcall(e820_mark_nvs_memory); +#endif +#endif + +#ifndef CONFIG_XEN_UNPRIVILEGED_GUEST +/* + * pre allocated 4k and reserved it in memblock and e820_saved + */ +u64 __init early_reserve_e820(u64 size, u64 align) +{ + u64 addr; +#ifdef CONFIG_XEN + unsigned int order = get_order(size); + int rc; + unsigned long max_initmap_pfn; + + if (!is_initial_xendomain()) + return 0; + size = PAGE_SIZE << order; + if (align < PAGE_SIZE) + align = PAGE_SIZE; +#endif + addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); + if (addr) { + e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED); + printk(KERN_INFO "update e820_saved for early_reserve_e820\n"); + update_e820_saved(); + } +#ifdef CONFIG_XEN + else + return 0; + max_initmap_pfn = ALIGN(PFN_UP(__pa(xen_start_info->pt_base)) + + xen_start_info->nr_pt_frames + + 1 + (1 << (19 - PAGE_SHIFT)), + 1UL << (22 - PAGE_SHIFT)); +#ifdef CONFIG_X86_32 + if ((addr >> PAGE_SHIFT) + < max(max_initmap_pfn, max_pfn_mapped)) + rc = xen_create_contiguous_region((unsigned long)__va(addr), + order, 32); +#else + if ((addr >> PAGE_SHIFT) < max_pfn_mapped) + rc = xen_create_contiguous_region((unsigned long)__va(addr), + order, 32); + else if ((addr >> PAGE_SHIFT) < max_initmap_pfn) + rc = xen_create_contiguous_region(__START_KERNEL_map + addr, + order, 32); +#endif + else + rc = early_create_contiguous_region(addr >> PAGE_SHIFT, + order, 32); + if (rc) + return 0; +#endif + + return addr; +} +#endif + +#ifdef CONFIG_X86_32 +# ifdef CONFIG_X86_PAE +# define MAX_ARCH_PFN (1ULL<<(40-PAGE_SHIFT)) +# else +# define MAX_ARCH_PFN (1ULL<<(32-PAGE_SHIFT)) +# endif +#else /* CONFIG_X86_32 */ +# define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT +#endif + +/* + * Find the highest page frame number we have available + */ +static unsigned long __init 
e820_end_pfn(unsigned long limit_pfn, unsigned type) +{ + int i; + unsigned long last_pfn = 0; + unsigned long max_arch_pfn = MAX_ARCH_PFN; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + unsigned long start_pfn; + unsigned long end_pfn; + + if (ei->type != type) + continue; + + start_pfn = ei->addr >> PAGE_SHIFT; + end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT; + + if (start_pfn >= limit_pfn) + continue; + if (end_pfn > limit_pfn) { + last_pfn = limit_pfn; + break; + } + if (end_pfn > last_pfn) + last_pfn = end_pfn; + } + + if (last_pfn > max_arch_pfn) + last_pfn = max_arch_pfn; + + printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n", + last_pfn, max_arch_pfn); + return last_pfn; +} +unsigned long __init e820_end_of_ram_pfn(void) +{ + return e820_end_pfn(MAX_ARCH_PFN, E820_RAM); +} + +unsigned long __init e820_end_of_low_ram_pfn(void) +{ + return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM); +} + +static void early_panic(char *msg) +{ + early_printk(msg); + panic(msg); +} + +static int userdef __initdata; + +/* "mem=nopentium" disables the 4MB page tables. */ +static int __init parse_memopt(char *p) +{ + u64 mem_size, current_end; + unsigned int i; + + if (!p) + return -EINVAL; + +#ifndef CONFIG_XEN + if (!strcmp(p, "nopentium")) { +#ifdef CONFIG_X86_32 + setup_clear_cpu_cap(X86_FEATURE_PSE); + return 0; +#else + printk(KERN_WARNING "mem=nopentium ignored! (only supported on x86_32)\n"); + return -EINVAL; +#endif + } +#endif + + userdef = 1; + mem_size = memparse(p, &p); + /* don't remove all of memory when handling "mem={invalid}" param */ + if (mem_size == 0) + return -EINVAL; +#ifdef CONFIG_XEN + /* + * A little less than 2% of available memory are needed for page + * tables, p2m map, and mem_map. Hence the maximum amount of memory + * we can potentially balloon up to can in no case exceed about 50 + * times of what we've been given initially. Since even with that we + * won't be able to boot (due to various calculations done based on + * the total number of pages) we further restrict this to factor 32. + */ + if ((mem_size >> (PAGE_SHIFT + 5)) > xen_start_info->nr_pages) { + u64 size = (u64)xen_start_info->nr_pages << 5; + + pr_warn("mem=%Luk is invalid for an initial" + " allocation of %luk, using %Luk\n", + (unsigned long long)mem_size >> 10, + xen_start_info->nr_pages << (PAGE_SHIFT - 10), + (unsigned long long)size << (PAGE_SHIFT - 10)); + mem_size = size << PAGE_SHIFT; + } +#endif + e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); + + i = e820.nr_map - 1; + current_end = e820.map[i].addr + e820.map[i].size; + if (current_end < mem_size) { + /* + * The e820 map ends before our requested size so + * extend the final entry to the requested address. + */ + if (e820.map[i].type == E820_RAM) + e820.map[i].size = mem_size - e820.map[i].addr; + else + e820_add_region(current_end, mem_size - current_end, E820_RAM); + } + + return 0; +} +early_param("mem", parse_memopt); + +#ifndef CONFIG_XEN +static int __init parse_memmap_opt(char *p) +{ + char *oldp; + u64 start_at, mem_size; + + if (!p) + return -EINVAL; + + if (!strncmp(p, "exactmap", 8)) { +#ifdef CONFIG_CRASH_DUMP + /* + * If we are doing a crash dump, we still need to know + * the real mem size before original memory map is + * reset. 
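The Xen branch of parse_memopt() above caps "mem=" at 32 times the domain's initial allocation. The arithmetic can be checked with a standalone sketch (PAGE_SHIFT and the sample allocation are stand-ins for illustration):

/* Standalone illustration of the factor-32 clamp applied to "mem="
 * in the Xen branch of parse_memopt() above. */
#include <stdio.h>

#define PAGE_SHIFT 12

static unsigned long long clamp_mem(unsigned long long mem_size_bytes,
                                    unsigned long initial_pages)
{
        /* requested size, expressed in units of 32 pages */
        if ((mem_size_bytes >> (PAGE_SHIFT + 5)) > initial_pages)
                mem_size_bytes = (unsigned long long)initial_pages << (PAGE_SHIFT + 5);
        return mem_size_bytes;
}

int main(void)
{
        /* domain started with 512 MiB (131072 pages), "mem=64G" requested */
        unsigned long long req = 64ULL << 30;
        unsigned long long lim = clamp_mem(req, 131072);

        printf("mem= clamped from %lluM to %lluM\n", req >> 20, lim >> 20);
        return 0;
}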
+ */ + saved_max_pfn = e820_end_of_ram_pfn(); +#endif + e820.nr_map = 0; + userdef = 1; + return 0; + } + + oldp = p; + mem_size = memparse(p, &p); + if (p == oldp) + return -EINVAL; + + userdef = 1; + if (*p == '@') { + start_at = memparse(p+1, &p); + e820_add_region(start_at, mem_size, E820_RAM); + } else if (*p == '#') { + start_at = memparse(p+1, &p); + e820_add_region(start_at, mem_size, E820_ACPI); + } else if (*p == '$') { + start_at = memparse(p+1, &p); + e820_add_region(start_at, mem_size, E820_RESERVED); + } else + e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); + + return *p == '\0' ? 0 : -EINVAL; +} +early_param("memmap", parse_memmap_opt); +#endif + +void __init finish_e820_parsing(void) +{ + if (userdef) { + u32 nr = e820.nr_map; + + if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0) + early_panic("Invalid user supplied memory map"); + e820.nr_map = nr; + + printk(KERN_INFO "user-defined physical RAM map:\n"); + _e820_print_map(&e820, "user"); + } +} + +static inline const char *e820_type_to_string(int e820_type) +{ + switch (e820_type) { + case E820_RESERVED_KERN: + case E820_RAM: return "System RAM"; + case E820_ACPI: return "ACPI Tables"; + case E820_NVS: return "ACPI Non-volatile Storage"; + case E820_UNUSABLE: return "Unusable memory"; + default: return "reserved"; + } +} + +#ifdef CONFIG_XEN +#define e820 machine_e820 +#endif + +/* + * Mark e820 reserved areas as busy for the resource manager. + */ +static struct resource __initdata *e820_res; +void __init e820_reserve_resources(void) +{ + int i; + struct resource *res; + u64 end; + + res = alloc_bootmem(sizeof(struct resource) * e820.nr_map); + e820_res = res; + for (i = 0; i < e820.nr_map; i++) { + end = e820.map[i].addr + e820.map[i].size - 1; + if (end != (resource_size_t)end) { + res++; + continue; + } + res->name = e820_type_to_string(e820.map[i].type); + res->start = e820.map[i].addr; + res->end = end; + + res->flags = IORESOURCE_MEM; + + /* + * don't register the region that could be conflicted with + * pci device BAR resource and insert them later in + * pcibios_resource_survey() + */ + if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) { + if (e820.map[i].type != E820_NVS) + res->flags |= IORESOURCE_BUSY; + insert_resource(&iomem_resource, res); + } + res++; + } + + for (i = 0; i < e820_saved.nr_map; i++) { + struct e820entry *entry = &e820_saved.map[i]; + firmware_map_add_early(entry->addr, + entry->addr + entry->size - 1, + e820_type_to_string(entry->type)); + } +} + +/* How much should we pad RAM ending depending on where it is? 
*/ +static unsigned long ram_alignment(resource_size_t pos) +{ + unsigned long mb = pos >> 20; + + /* To 64kB in the first megabyte */ + if (!mb) + return 64*1024; + + /* To 1MB in the first 16MB */ + if (mb < 16) + return 1024*1024; + + /* To 64MB for anything above that */ + return 64*1024*1024; +} + +#define MAX_RESOURCE_SIZE ((resource_size_t)-1) + +void __init e820_reserve_resources_late(void) +{ + int i; + struct resource *res; + + res = e820_res; + for (i = 0; i < e820.nr_map; i++) { + if (!res->parent && res->end) + insert_resource_expand_to_fit(&iomem_resource, res); + res++; + } + + /* + * Try to bump up RAM regions to reasonable boundaries to + * avoid stolen RAM: + */ + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *entry = &e820.map[i]; + u64 start, end; + + if (entry->type != E820_RAM) + continue; + start = entry->addr + entry->size; + end = round_up(start, ram_alignment(start)) - 1; + if (end > MAX_RESOURCE_SIZE) + end = MAX_RESOURCE_SIZE; + if (start >= end) + continue; + printk(KERN_DEBUG "reserve RAM buffer: %016llx - %016llx ", + start, end); + reserve_region_with_split(&iomem_resource, start, end, + "RAM buffer"); + } +} + +#undef e820 + +char *__init default_machine_specific_memory_setup(void) +{ + int rc, nr_map; + unsigned long maxmem; + struct xen_memory_map memmap; + static struct e820entry __initdata map[E820MAX]; + + memmap.nr_entries = E820MAX; + set_xen_guest_handle(memmap.buffer, map); + + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); + if (rc == -ENOSYS) { + memmap.nr_entries = 1; + map[0].addr = 0ULL; + map[0].size = PFN_PHYS((unsigned long long)xen_start_info->nr_pages); + /* 8MB slack (to balance backend allocations). */ + map[0].size += 8ULL << 20; + map[0].type = E820_RAM; + rc = 0; + } + BUG_ON(rc); + + nr_map = memmap.nr_entries; + sanitize_e820_map(map, ARRAY_SIZE(map), &nr_map); + + if (append_e820_map(map, nr_map) < 0) + BUG(); + +#ifdef CONFIG_XEN + /* See the comment in parse_memopt(). 
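For reference, the padding rule implemented by ram_alignment() above can be exercised standalone; the sample positions below are made up to show the three thresholds:

/* Standalone copy of the padding rule above: 64 KiB below 1 MiB,
 * 1 MiB below 16 MiB, 64 MiB for everything higher. */
#include <stdio.h>

static unsigned long ram_align(unsigned long long pos)
{
        unsigned long long mb = pos >> 20;

        if (!mb)
                return 64 * 1024;               /* below 1 MiB: 64 KiB */
        if (mb < 16)
                return 1024 * 1024;             /* below 16 MiB: 1 MiB */
        return 64 * 1024 * 1024;                /* above that: 64 MiB */
}

int main(void)
{
        unsigned long long pos[] = { 0x9fc00ULL, 0xa00000ULL, 0x7ff00000ULL };
        int i;

        for (i = 0; i < 3; i++)
                printf("end %#llx -> pad to %lu-byte boundary\n",
                       pos[i], ram_align(pos[i]));
        return 0;
}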
*/ + for (maxmem = rc = 0; rc < e820.nr_map; ++rc) + if (e820.map[rc].type == E820_RAM) + maxmem += e820.map[rc].size >> PAGE_SHIFT; + if (is_initial_xendomain()) { + domid_t domid = DOMID_SELF; + + rc = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid); + if (rc > 0 && maxmem > rc) + maxmem = rc; + } + if ((maxmem >> 5) > xen_start_info->nr_pages) { + unsigned long long size = (u64)xen_start_info->nr_pages << 5; + + pr_warn("maxmem of %luM is invalid for an initial" + " allocation of %luM, using %LuM\n", + maxmem >> (20 - PAGE_SHIFT), + xen_start_info->nr_pages >> (20 - PAGE_SHIFT), + size >> (20 - PAGE_SHIFT)); + size <<= PAGE_SHIFT; + e820_remove_range(size, ULLONG_MAX - size, E820_RAM, 1); + } + + if (is_initial_xendomain()) { + memmap.nr_entries = E820MAX; + set_xen_guest_handle(memmap.buffer, machine_e820.map); + + if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap)) + BUG(); + machine_e820.nr_map = memmap.nr_entries; + } +#endif + + return "Xen"; +} + +void __init setup_memory_map(void) +{ + char *who; + + who = x86_init.resources.memory_setup(); +#ifndef CONFIG_XEN_UNPRIVILEGED_GUEST +#ifdef CONFIG_XEN + if (is_initial_xendomain()) { + printk(KERN_INFO "Xen-provided machine memory map:\n"); + _e820_print_map(&machine_e820, "BIOS"); + } else +#endif + memcpy(&e820_saved, &e820, sizeof(struct e820map)); +#endif + printk(KERN_INFO "Xen-provided physical RAM map:\n"); + _e820_print_map(&e820, who); +} + +void __init memblock_x86_fill(void) +{ + int i; + u64 end; + + /* + * EFI may have more than 128 entries + * We are safe to enable resizing, beause memblock_x86_fill() + * is rather later for x86 + */ + memblock_allow_resize(); + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + end = ei->addr + ei->size; + if (end != (resource_size_t)end) + continue; + + if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN) + continue; + + memblock_add(ei->addr, ei->size); + } + +#ifdef CONFIG_XEN + if (max_pfn > xen_start_info->nr_pages) + memblock_reserve(PFN_PHYS(xen_start_info->nr_pages), + PFN_PHYS(max_pfn - xen_start_info->nr_pages)); +#endif + + memblock_dump_all(); +} + +void __init memblock_find_dma_reserve(void) +{ +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) + u64 nr_pages = 0, nr_free_pages = 0; + unsigned long start_pfn, end_pfn; + phys_addr_t start, end; + int i; + u64 u; + + /* + * need to find out used area below MAX_DMA_PFN + * need to use memblock to get free size in [0, MAX_DMA_PFN] + * at first, and assume boot_mem will not take below MAX_DMA_PFN + */ + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { + start_pfn = min_t(unsigned long, start_pfn, MAX_DMA_PFN); + end_pfn = min_t(unsigned long, end_pfn, MAX_DMA_PFN); + nr_pages += end_pfn - start_pfn; + } + + for_each_free_mem_range(u, MAX_NUMNODES, &start, &end, NULL) { + start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN); + end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN); + if (start_pfn < end_pfn) + nr_free_pages += end_pfn - start_pfn; + } + + set_dma_reserve(nr_pages - nr_free_pages); +#endif +} diff --git a/arch/x86/kernel/early_printk-xen.c b/arch/x86/kernel/early_printk-xen.c new file mode 100644 index 0000000..ea02752 --- /dev/null +++ b/arch/x86/kernel/early_printk-xen.c @@ -0,0 +1,291 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef CONFIG_XEN +/* Simple VGA output */ +#define 
VGABASE (__ISA_IO_base + 0xb8000) + +static int max_ypos = 25, max_xpos = 80; +static int current_ypos = 25, current_xpos; + +static void early_vga_write(struct console *con, const char *str, unsigned n) +{ + char c; + int i, k, j; + + while ((c = *str++) != '\0' && n-- > 0) { + if (current_ypos >= max_ypos) { + /* scroll 1 line up */ + for (k = 1, j = 0; k < max_ypos; k++, j++) { + for (i = 0; i < max_xpos; i++) { + writew(readw(VGABASE+2*(max_xpos*k+i)), + VGABASE + 2*(max_xpos*j + i)); + } + } + for (i = 0; i < max_xpos; i++) + writew(0x720, VGABASE + 2*(max_xpos*j + i)); + current_ypos = max_ypos-1; + } +#ifdef CONFIG_KGDB_KDB + if (c == '\b') { + if (current_xpos > 0) + current_xpos--; + } else if (c == '\r') { + current_xpos = 0; + } else +#endif + if (c == '\n') { + current_xpos = 0; + current_ypos++; + } else if (c != '\r') { + writew(((0x7 << 8) | (unsigned short) c), + VGABASE + 2*(max_xpos*current_ypos + + current_xpos++)); + if (current_xpos >= max_xpos) { + current_xpos = 0; + current_ypos++; + } + } + } +} + +static struct console early_vga_console = { + .name = "earlyvga", + .write = early_vga_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */ + +static int early_serial_base = 0x3f8; /* ttyS0 */ + +#define XMTRDY 0x20 + +#define DLAB 0x80 + +#define TXR 0 /* Transmit register (WRITE) */ +#define RXR 0 /* Receive register (READ) */ +#define IER 1 /* Interrupt Enable */ +#define IIR 2 /* Interrupt ID */ +#define FCR 2 /* FIFO control */ +#define LCR 3 /* Line control */ +#define MCR 4 /* Modem control */ +#define LSR 5 /* Line Status */ +#define MSR 6 /* Modem Status */ +#define DLL 0 /* Divisor Latch Low */ +#define DLH 1 /* Divisor latch High */ + +static int early_serial_putc(unsigned char ch) +{ + unsigned timeout = 0xffff; + + while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) + cpu_relax(); + outb(ch, early_serial_base + TXR); + return timeout ? 
0 : -1; +} + +static void early_serial_write(struct console *con, const char *s, unsigned n) +{ + while (*s && n-- > 0) { + if (*s == '\n') + early_serial_putc('\r'); + early_serial_putc(*s); + s++; + } +} + +#define DEFAULT_BAUD 9600 + +static __init void early_serial_init(char *s) +{ + unsigned char c; + unsigned divisor; + unsigned baud = DEFAULT_BAUD; + char *e; + + if (*s == ',') + ++s; + + if (*s) { + unsigned port; + if (!strncmp(s, "0x", 2)) { + early_serial_base = simple_strtoul(s, &e, 16); + } else { + static const int __initconst bases[] = { 0x3f8, 0x2f8 }; + + if (!strncmp(s, "ttyS", 4)) + s += 4; + port = simple_strtoul(s, &e, 10); + if (port > 1 || s == e) + port = 0; + early_serial_base = bases[port]; + } + s += strcspn(s, ","); + if (*s == ',') + s++; + } + + outb(0x3, early_serial_base + LCR); /* 8n1 */ + outb(0, early_serial_base + IER); /* no interrupt */ + outb(0, early_serial_base + FCR); /* no fifo */ + outb(0x3, early_serial_base + MCR); /* DTR + RTS */ + + if (*s) { + baud = simple_strtoul(s, &e, 0); + if (baud == 0 || s == e) + baud = DEFAULT_BAUD; + } + + divisor = 115200 / baud; + c = inb(early_serial_base + LCR); + outb(c | DLAB, early_serial_base + LCR); + outb(divisor & 0xff, early_serial_base + DLL); + outb((divisor >> 8) & 0xff, early_serial_base + DLH); + outb(c & ~DLAB, early_serial_base + LCR); +} + +#else /* CONFIG_XEN */ + +static void +early_serial_write(struct console *con, const char *s, unsigned count) +{ + int n; + + while (count > 0) { + n = HYPERVISOR_console_io(CONSOLEIO_write, count, (char *)s); + if (n <= 0) + break; + count -= n; + s += n; + } +} + +static __init void early_serial_init(char *s) +{ +} + +/* + * No early VGA console on Xen, as we do not have convenient ISA-space + * mappings. Someone should fix this for domain 0. For now, use fake serial. + */ +#define early_vga_console early_serial_console + +#endif + +static struct console early_serial_console = { + .name = "earlyser", + .write = early_serial_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Direct interface for emergencies */ +static struct console *early_console = &early_vga_console; +static int __initdata early_console_initialized; + +asmlinkage void early_printk(const char *fmt, ...) 
+{ + char buf[512]; + int n; + va_list ap; + + va_start(ap, fmt); + n = vscnprintf(buf, sizeof(buf), fmt, ap); + early_console->write(early_console, buf, n); + va_end(ap); +} + +static inline void early_console_register(struct console *con, int keep_early) +{ + if (early_console->index != -1) { + printk(KERN_CRIT "ERROR: earlyprintk= %s already used\n", + con->name); + return; + } + early_console = con; + if (keep_early) + early_console->flags &= ~CON_BOOT; + else + early_console->flags |= CON_BOOT; + register_console(early_console); +} + +static int __init setup_early_printk(char *buf) +{ + int keep; + + if (!buf) + return 0; + + if (early_console_initialized) + return 0; + early_console_initialized = 1; + + keep = (strstr(buf, "keep") != NULL); + + while (*buf != '\0') { + if (!strncmp(buf, "serial", 6)) { + buf += 6; + early_serial_init(buf); + early_console_register(&early_serial_console, keep); + if (!strncmp(buf, ",ttyS", 5)) + buf += 5; + } + if (!strncmp(buf, "ttyS", 4)) { + early_serial_init(buf + 4); + early_console_register(&early_serial_console, keep); + } +#ifndef CONFIG_XEN + if (!strncmp(buf, "vga", 3) && + boot_params.screen_info.orig_video_isVGA == 1) { + max_xpos = boot_params.screen_info.orig_video_cols; + max_ypos = boot_params.screen_info.orig_video_lines; + current_ypos = boot_params.screen_info.orig_y; +#else + if (!strncmp(buf, "vga", 3) || !strncmp(buf, "xen", 3)) { +#endif + early_console_register(&early_vga_console, keep); + } +#ifdef CONFIG_EARLY_PRINTK_DBGP + if (!strncmp(buf, "dbgp", 4) && !early_dbgp_init(buf + 4)) + early_console_register(&early_dbgp_console, keep); +#endif +#ifdef CONFIG_HVC_XEN + if (!strncmp(buf, "xen", 3)) + early_console_register(&xenboot_console, keep); +#endif +#ifdef CONFIG_EARLY_PRINTK_INTEL_MID + if (!strncmp(buf, "mrst", 4)) { + mrst_early_console_init(); + early_console_register(&early_mrst_console, keep); + } + + if (!strncmp(buf, "hsu", 3)) { + hsu_early_console_init(buf + 3); + early_console_register(&early_hsu_console, keep); + } +#endif + buf++; + } + return 0; +} + +early_param("earlyprintk", setup_early_printk); diff --git a/arch/x86/kernel/entry_32-xen.S b/arch/x86/kernel/entry_32-xen.S new file mode 100644 index 0000000..768399e --- /dev/null +++ b/arch/x86/kernel/entry_32-xen.S @@ -0,0 +1,1722 @@ +/* + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * entry.S contains the system-call and fault low-level handling routines. + * This also contains the timer-interrupt handler, as well as all interrupts + * and faults that can result in a task-switch. + * + * NOTE: This code handles signal-recognition, which happens every time + * after a timer-interrupt and after each system call. + * + * I changed all the .align's to 4 (16 byte alignment), as that's faster + * on a 486. + * + * Stack layout in 'syscall_exit': + * ptrace needs to have all regs on the stack. + * if the order here is changed, it needs to be + * updated in fork.c:copy_process, signal.c:do_signal, + * ptrace.c and ptrace.h + * + * 0(%esp) - %ebx + * 4(%esp) - %ecx + * 8(%esp) - %edx + * C(%esp) - %esi + * 10(%esp) - %edi + * 14(%esp) - %ebp + * 18(%esp) - %eax + * 1C(%esp) - %ds + * 20(%esp) - %es + * 24(%esp) - %fs + * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS + * 2C(%esp) - orig_eax + * 30(%esp) - %eip + * 34(%esp) - %cs + * 38(%esp) - %eflags + * 3C(%esp) - %oldesp + * 40(%esp) - %oldss + * + * "current" is in register %ebx during any slow entries. 
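The saved-register layout listed in the comment above corresponds, field for field, to the 32-bit pt_regs ordering. A standalone mirror with fixed-width fields makes the offsets checkable on any host (illustration only, not the kernel's definition):

/* Standalone mirror of the stack layout documented above.  The gs slot
 * is only populated when CONFIG_X86_32_LAZY_GS is not set, as noted. */
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct pt_regs_32 {
        uint32_t ebx, ecx, edx, esi, edi, ebp, eax;
        uint32_t ds, es, fs, gs;
        uint32_t orig_eax;
        uint32_t eip, cs, eflags, oldesp, oldss;
};

int main(void)
{
        printf("orig_eax at 0x%zx, eip at 0x%zx, oldss at 0x%zx\n",
               offsetof(struct pt_regs_32, orig_eax),   /* 0x2c */
               offsetof(struct pt_regs_32, eip),        /* 0x30 */
               offsetof(struct pt_regs_32, oldss));     /* 0x40 */
        return 0;
}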
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Avoid __ASSEMBLER__'ifying just for this. */ +#include +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_LE 0x40000000 + +#ifndef CONFIG_AUDITSYSCALL +#define sysenter_audit syscall_trace_entry +#define sysexit_audit syscall_exit_work +#endif + + .section .entry.text, "ax" + +/* + * We use macros for low-level operations which need to be overridden + * for paravirtualization. The following will never clobber any registers: + * INTERRUPT_RETURN (aka. "iret") + * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") + * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). + * + * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must + * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). + * Allowing a register to be clobbered can shrink the paravirt replacement + * enough to patch inline, increasing performance. + */ + +/* Pseudo-eflags. */ +NMI_MASK = 0x80000000 + +#ifdef CONFIG_PREEMPT +#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF +#else +#define preempt_stop(clobbers) +#define resume_kernel restore_all +#endif + +.macro TRACE_IRQS_IRET +#ifdef CONFIG_TRACE_IRQFLAGS + testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off? + jz 1f + TRACE_IRQS_ON +1: +#endif +.endm + +#ifdef CONFIG_VM86 +#define resume_userspace_sig check_userspace +#else +#define resume_userspace_sig resume_userspace +#endif + +/* + * User gs save/restore + * + * %gs is used for userland TLS and kernel only uses it for stack + * canary which is required to be at %gs:20 by gcc. Read the comment + * at the top of stackprotector.h for more info. + * + * Local labels 98 and 99 are used. 
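The constraint mentioned above, namely that gcc's stack protector reads the canary from the fixed address %gs:20 on 32-bit, can be illustrated with a standalone struct; the layout below is a stand-in, not the kernel's per-CPU definition:

/* Standalone illustration: whatever %gs points at must keep a 4-byte
 * canary slot at offset 20, because that is where -fstack-protector
 * generated code looks for it on 32-bit. */
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct gs_area {
        char other[20];         /* whatever else lives below the canary */
        uint32_t canary;        /* must land at offset 20 (%gs:20) */
};

int main(void)
{
        printf("canary offset = %zu\n", offsetof(struct gs_area, canary));
        return 0;
}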
+ */ +#ifdef CONFIG_X86_32_LAZY_GS + + /* unfortunately push/pop can't be no-op */ +.macro PUSH_GS + pushl_cfi $0 +.endm +.macro POP_GS pop=0 + addl $(4 + \pop), %esp + CFI_ADJUST_CFA_OFFSET -(4 + \pop) +.endm +.macro POP_GS_EX +.endm + + /* all the rest are no-op */ +.macro PTGS_TO_GS +.endm +.macro PTGS_TO_GS_EX +.endm +.macro GS_TO_REG reg +.endm +.macro REG_TO_PTGS reg +.endm +.macro SET_KERNEL_GS reg +.endm + +#else /* CONFIG_X86_32_LAZY_GS */ + +.macro PUSH_GS + pushl_cfi %gs + /*CFI_REL_OFFSET gs, 0*/ +.endm + +.macro POP_GS pop=0 +98: popl_cfi %gs + /*CFI_RESTORE gs*/ + .if \pop <> 0 + add $\pop, %esp + CFI_ADJUST_CFA_OFFSET -\pop + .endif +.endm +.macro POP_GS_EX +.pushsection .fixup, "ax" +99: movl $0, (%esp) + jmp 98b +.section __ex_table, "a" + .align 4 + .long 98b, 99b +.popsection +.endm + +.macro PTGS_TO_GS +98: mov PT_GS(%esp), %gs +.endm +.macro PTGS_TO_GS_EX +.pushsection .fixup, "ax" +99: movl $0, PT_GS(%esp) + jmp 98b +.section __ex_table, "a" + .align 4 + .long 98b, 99b +.popsection +.endm + +.macro GS_TO_REG reg + movl %gs, \reg + /*CFI_REGISTER gs, \reg*/ +.endm +.macro REG_TO_PTGS reg + movl \reg, PT_GS(%esp) + /*CFI_REL_OFFSET gs, PT_GS*/ +.endm +.macro SET_KERNEL_GS reg + movl $(__KERNEL_STACK_CANARY), \reg + movl \reg, %gs +.endm + +#endif /* CONFIG_X86_32_LAZY_GS */ + +.macro SAVE_ALL + cld + PUSH_GS + pushl_cfi %fs + /*CFI_REL_OFFSET fs, 0;*/ + pushl_cfi %es + /*CFI_REL_OFFSET es, 0;*/ + pushl_cfi %ds + /*CFI_REL_OFFSET ds, 0;*/ + pushl_cfi %eax + CFI_REL_OFFSET eax, 0 + pushl_cfi %ebp + CFI_REL_OFFSET ebp, 0 + pushl_cfi %edi + CFI_REL_OFFSET edi, 0 + pushl_cfi %esi + CFI_REL_OFFSET esi, 0 + pushl_cfi %edx + CFI_REL_OFFSET edx, 0 + pushl_cfi %ecx + CFI_REL_OFFSET ecx, 0 + pushl_cfi %ebx + CFI_REL_OFFSET ebx, 0 + movl $(__USER_DS), %edx + movl %edx, %ds + movl %edx, %es + movl $(__KERNEL_PERCPU), %edx + movl %edx, %fs + SET_KERNEL_GS %edx +.endm + +.macro RESTORE_INT_REGS + popl_cfi %ebx + CFI_RESTORE ebx + popl_cfi %ecx + CFI_RESTORE ecx + popl_cfi %edx + CFI_RESTORE edx + popl_cfi %esi + CFI_RESTORE esi + popl_cfi %edi + CFI_RESTORE edi + popl_cfi %ebp + CFI_RESTORE ebp + popl_cfi %eax + CFI_RESTORE eax +.endm + +.macro RESTORE_REGS pop=0 + RESTORE_INT_REGS +1: popl_cfi %ds + /*CFI_RESTORE ds;*/ +2: popl_cfi %es + /*CFI_RESTORE es;*/ +3: popl_cfi %fs + /*CFI_RESTORE fs;*/ + POP_GS \pop +.pushsection .fixup, "ax" +4: movl $0, (%esp) + jmp 1b +5: movl $0, (%esp) + jmp 2b +6: movl $0, (%esp) + jmp 3b +.section __ex_table, "a" + .align 4 + .long 1b, 4b + .long 2b, 5b + .long 3b, 6b +.popsection + POP_GS_EX +.endm + +.macro RING0_INT_FRAME + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA esp, 3*4 + /*CFI_OFFSET cs, -2*4;*/ + CFI_OFFSET eip, -3*4 +.endm + +.macro RING0_EC_FRAME + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA esp, 4*4 + /*CFI_OFFSET cs, -2*4;*/ + CFI_OFFSET eip, -3*4 +.endm + +.macro RING0_PTREGS_FRAME + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA esp, PT_OLDESP-PT_EBX + /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/ + CFI_OFFSET eip, PT_EIP-PT_OLDESP + /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/ + /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/ + CFI_OFFSET eax, PT_EAX-PT_OLDESP + CFI_OFFSET ebp, PT_EBP-PT_OLDESP + CFI_OFFSET edi, PT_EDI-PT_OLDESP + CFI_OFFSET esi, PT_ESI-PT_OLDESP + CFI_OFFSET edx, PT_EDX-PT_OLDESP + CFI_OFFSET ecx, PT_ECX-PT_OLDESP + CFI_OFFSET ebx, PT_EBX-PT_OLDESP +.endm + +ENTRY(ret_from_fork) + CFI_STARTPROC + pushl_cfi %eax + call schedule_tail + GET_THREAD_INFO(%ebp) + popl_cfi %eax + pushl_cfi $0x0202 # Reset kernel eflags + 
popfl_cfi + jmp syscall_exit + CFI_ENDPROC +END(ret_from_fork) + +/* + * Interrupt exit functions should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" +/* + * Return to user mode is not as complex as all this looks, + * but we want the default path for a system call return to + * go as quickly as possible which is why some of this is + * less clear than it otherwise should be. + */ + + # userspace resumption stub bypassing syscall exit tracing + ALIGN + RING0_PTREGS_FRAME +ret_from_exception: + preempt_stop(CLBR_ANY) +ret_from_intr: + GET_THREAD_INFO(%ebp) +check_userspace: + movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax + cmpl $USER_RPL, %eax + jb resume_kernel # not returning to v8086 or userspace + +ENTRY(resume_userspace) + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done on + # int/exception return? + jne work_pending + jmp restore_all +END(ret_from_exception) + +#ifdef CONFIG_PREEMPT +ENTRY(resume_kernel) + DISABLE_INTERRUPTS(CLBR_ANY) + cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? + jnz restore_all +need_resched: + movl TI_flags(%ebp), %ecx # need_resched set ? + testb $_TIF_NEED_RESCHED, %cl + jz restore_all + testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? + jz restore_all + call preempt_schedule_irq + jmp need_resched +END(resume_kernel) +#endif + CFI_ENDPROC +/* + * End of kprobes section + */ + .popsection + +/* SYSENTER_RETURN points to after the "sysenter" instruction in + the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ + + # sysenter call handler stub +ENTRY(ia32_sysenter_target) + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA esp, 0 + CFI_REGISTER esp, ebp + movl SYSENTER_stack_sp0(%esp),%esp +sysenter_past_esp: + /* + * Interrupts are disabled here, but we can't trace it until + * enough kernel state to call TRACE_IRQS_OFF can be called - but + * we immediately enable interrupts at that point anyway. + */ + pushl_cfi $__USER_DS + /*CFI_REL_OFFSET ss, 0*/ + pushl_cfi %ebp + CFI_REL_OFFSET esp, 0 + pushfl_cfi + orl $X86_EFLAGS_IF, (%esp) + pushl_cfi $__USER_CS + /*CFI_REL_OFFSET cs, 0*/ + /* + * Push current_thread_info()->sysenter_return to the stack. + * A tiny bit of offset fixup is necessary - 4*4 means the 4 words + * pushed above; +8 corresponds to copy_thread's esp0 setting. + */ + pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp) + CFI_REL_OFFSET eip, 0 + + pushl_cfi %eax + SAVE_ALL + ENABLE_INTERRUPTS(CLBR_NONE) + +/* + * Load the potential sixth argument from user stack. + * Careful about security. 
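+ *
+ * %ebp holds the user stack pointer at this point. The check against
+ * __PAGE_OFFSET-3 guarantees the 4-byte load below cannot reach kernel
+ * memory, and the __ex_table entry sends any fault that still occurs to
+ * syscall_fault.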
+ */ + cmpl $__PAGE_OFFSET-3,%ebp + jae syscall_fault +1: movl (%ebp),%ebp + movl %ebp,PT_EBP(%esp) +.section __ex_table,"a" + .align 4 + .long 1b,syscall_fault +.previous + + GET_THREAD_INFO(%ebp) + + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) + jnz sysenter_audit +sysenter_do_call: + cmpl $(NR_syscalls), %eax + jae syscall_badsys + call *sys_call_table(,%eax,4) + movl %eax,PT_EAX(%esp) + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testl $_TIF_ALLWORK_MASK, %ecx + jne sysexit_audit +sysenter_exit: +/* if something modifies registers it must also disable sysexit */ + movl PT_EIP(%esp), %edx + movl PT_OLDESP(%esp), %ecx + xorl %ebp,%ebp +#ifdef CONFIG_XEN_VCPU_INFO_PLACEMENT + GET_VCPU_INFO +#endif + TRACE_IRQS_ON +1: mov PT_FS(%esp), %fs + PTGS_TO_GS + ENABLE_INTERRUPTS_SYSEXIT + +#ifdef CONFIG_AUDITSYSCALL +sysenter_audit: + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + jnz syscall_trace_entry + addl $4,%esp + CFI_ADJUST_CFA_OFFSET -4 + /* %esi already in 8(%esp) 6th arg: 4th syscall arg */ + /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */ + /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */ + movl %ebx,%ecx /* 3rd arg: 1st syscall arg */ + movl %eax,%edx /* 2nd arg: syscall number */ + movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ + call __audit_syscall_entry + pushl_cfi %ebx + movl PT_EAX(%esp),%eax /* reload syscall number */ + jmp sysenter_do_call + +sysexit_audit: + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx + jne syscall_exit_work + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) + movl %eax,%edx /* second arg, syscall return value */ + cmpl $-MAX_ERRNO,%eax /* is it an error ? */ + setbe %al /* 1 if so, 0 if not */ + movzbl %al,%eax /* zero-extend that */ + call __audit_syscall_exit + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx + jne syscall_exit_work + movl PT_EAX(%esp),%eax /* reload syscall return value */ + jmp sysenter_exit +#endif + + CFI_ENDPROC +.pushsection .fixup,"ax" +2: movl $0,PT_FS(%esp) + jmp 1b +.section __ex_table,"a" + .align 4 + .long 1b,2b +.popsection + PTGS_TO_GS_EX +ENDPROC(ia32_sysenter_target) + + # pv sysenter call handler stub +ENTRY(ia32pv_sysenter_target) + RING0_INT_FRAME + movl $__USER_DS,16(%esp) + movl %ebp,12(%esp) + movl $__USER_CS,4(%esp) + addl $4,%esp + CFI_ADJUST_CFA_OFFSET -4 + /* +5*4 is SS:ESP,EFLAGS,CS:EIP. +8 is esp0 setting. */ + pushl_cfi (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) +/* + * Load the potential sixth argument from user stack. + * Careful about security. 
+ */ + cmpl $__PAGE_OFFSET-3,%ebp + jae syscall_fault +1: movl (%ebp),%ebp +.section __ex_table,"a" + .align 4 + .long 1b,syscall_fault +.previous + jmp system_call + CFI_ENDPROC +ENDPROC(ia32pv_sysenter_target) + +/* + * syscall stub including irq exit should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" + # system call handler stub +ENTRY(system_call) + RING0_INT_FRAME # can't unwind into user space anyway + pushl_cfi %eax # save orig_eax + SAVE_ALL + GET_THREAD_INFO(%ebp) + # system call tracing in operation / emulation + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) + jnz syscall_trace_entry + cmpl $(NR_syscalls), %eax + jae syscall_badsys +syscall_call: + call *sys_call_table(,%eax,4) + movl %eax,PT_EAX(%esp) # store the return value +syscall_exit: + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testl $_TIF_ALLWORK_MASK, %ecx # current->work + jne syscall_exit_work + +restore_all: + TRACE_IRQS_IRET +restore_all_notrace: +#ifndef CONFIG_XEN + movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS + # Warning: PT_OLDSS(%esp) contains the wrong/random values if we + # are returning to the kernel. + # See comments in process.c:copy_thread() for details. + movb PT_OLDSS(%esp), %ah + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax + cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax + CFI_REMEMBER_STATE + je ldt_ss # returning to user-space with LDT SS +restore_nocheck: +#else +restore_nocheck: + movl PT_EFLAGS(%esp), %eax + testl $(X86_EFLAGS_VM|NMI_MASK), %eax + CFI_REMEMBER_STATE + jnz hypervisor_iret + shr $9, %eax # EAX[0] == IRET_EFLAGS.IF + GET_VCPU_INFO + andb evtchn_upcall_mask(%esi),%al + andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask + CFI_REMEMBER_STATE + jnz restore_all_enable_events # != 0 => enable event delivery +#endif + RESTORE_REGS 4 # skip orig_eax/error_code +irq_return: + INTERRUPT_RETURN +.section .fixup,"ax" +ENTRY(iret_exc) + pushl $0 # no error code + pushl $do_iret_error + jmp error_code +.previous +.section __ex_table,"a" + .align 4 + .long irq_return,iret_exc +.previous + + CFI_RESTORE_STATE +#ifndef CONFIG_XEN +ldt_ss: + larl PT_OLDSS(%esp), %eax + jnz restore_nocheck + testl $0x00400000, %eax # returning to 32bit stack? + jnz restore_nocheck # allright, normal return + +#ifdef CONFIG_PARAVIRT + /* + * The kernel can't run on a non-flat stack if paravirt mode + * is active. Rather than try to fixup the high bits of + * ESP, bypass this code entirely. This may break DOSemu + * and/or Wine support in a paravirt VM, although the option + * is still available to implement the setting of the high + * 16-bits in the INTERRUPT_RETURN paravirt-op. + */ + cmpl $0, pv_info+PARAVIRT_enabled + jne restore_nocheck +#endif + +/* + * Setup and switch to ESPFIX stack + * + * We're returning to userspace with a 16 bit stack. The CPU will not + * restore the high word of ESP for us on executing iret... This is an + * "official" bug of all the x86-compatible CPUs, which we can work + * around to make dosemu and wine happy. We do this by preloading the + * high word of ESP with the high word of the userspace ESP while + * compensating for the offset by changing to the ESPFIX segment with + * a base address that matches for the difference. 
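+ *
+ * Worked example: kernel %esp = 0xce013f40, saved user %esp = 0x1234abcd.
+ * The code below builds the new %esp 0x12343f40 (user high word, kernel
+ * low word) and programs the espfix segment base with the difference
+ * 0xce013f40 - 0x12343f40 = 0xbbcd0000, so base + new %esp still points
+ * at the kernel stack while the architectural %esp already carries the
+ * high word that iret would otherwise fail to restore.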
+ */ +#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) + mov %esp, %edx /* load kernel esp */ + mov PT_OLDESP(%esp), %eax /* load userspace esp */ + mov %dx, %ax /* eax: new kernel esp */ + sub %eax, %edx /* offset (low word is 0) */ + shr $16, %edx + mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ + mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ + pushl_cfi $__ESPFIX_SS + pushl_cfi %eax /* new kernel esp */ + /* Disable interrupts, but do not irqtrace this section: we + * will soon execute iret and the tracer was already set to + * the irqstate after the iret */ + DISABLE_INTERRUPTS(CLBR_EAX) + lss (%esp), %esp /* switch to espfix segment */ + CFI_ADJUST_CFA_OFFSET -8 + jmp restore_nocheck +#else + ALIGN +restore_all_enable_events: + TRACE_IRQS_ON + __ENABLE_INTERRUPTS +scrit: /**** START OF CRITICAL REGION ****/ + __TEST_PENDING + jnz 14f # process more events if necessary... + RESTORE_REGS 4 +1: INTERRUPT_RETURN +.section __ex_table,"a" + .align 4 + .long 1b,iret_exc +.previous +14: __DISABLE_INTERRUPTS + TRACE_IRQS_OFF +ecrit: /**** END OF CRITICAL REGION ****/ + jmp .Ldo_upcall + + CFI_RESTORE_STATE +hypervisor_iret: + andl $~NMI_MASK, PT_EFLAGS(%esp) + RESTORE_REGS 4 + jmp hypercall_page + (__HYPERVISOR_iret * 32) +#endif + CFI_ENDPROC +ENDPROC(system_call) + + # perform work that needs to be done immediately before resumption + ALIGN + RING0_PTREGS_FRAME # can't unwind into user space anyway +work_pending: + testb $_TIF_NEED_RESCHED, %cl + jz work_notifysig +work_resched: + call schedule + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done other + # than syscall tracing? + jz restore_all + testb $_TIF_NEED_RESCHED, %cl + jnz work_resched + +work_notifysig: # deal with pending signals and + # notify-resume requests +#ifdef CONFIG_VM86 + testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) + movl %esp, %eax + jne work_notifysig_v86 # returning to kernel-space or + # vm86-space + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + xorl %edx, %edx + call do_notify_resume + jmp resume_userspace_sig + + ALIGN +work_notifysig_v86: + pushl_cfi %ecx # save ti_flags for do_notify_resume + call save_v86_state # %eax contains pt_regs pointer + popl_cfi %ecx + movl %eax, %esp +#else + movl %esp, %eax +#endif + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + xorl %edx, %edx + call do_notify_resume + jmp resume_userspace_sig +END(work_pending) + + # perform syscall exit tracing + ALIGN +syscall_trace_entry: + movl $-ENOSYS,PT_EAX(%esp) + movl %esp, %eax + call syscall_trace_enter + /* What it returned is what we'll actually use. 
*/ + cmpl $(NR_syscalls), %eax + jnae syscall_call + jmp syscall_exit +END(syscall_trace_entry) + + # perform syscall exit tracing + ALIGN +syscall_exit_work: + testl $_TIF_WORK_SYSCALL_EXIT, %ecx + jz work_pending + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call + # schedule() instead + movl %esp, %eax + call syscall_trace_leave + jmp resume_userspace +END(syscall_exit_work) + CFI_ENDPROC + + RING0_INT_FRAME # can't unwind into user space anyway +syscall_fault: + GET_THREAD_INFO(%ebp) + movl $-EFAULT,PT_EAX(%esp) + jmp resume_userspace +END(syscall_fault) + +syscall_badsys: + movl $-ENOSYS,PT_EAX(%esp) + jmp resume_userspace +END(syscall_badsys) + CFI_ENDPROC +/* + * End of kprobes section + */ + .popsection + +/* + * System calls that need a pt_regs pointer. + */ +#define PTREGSCALL0(name) \ +ENTRY(ptregs_##name) ; \ + leal 4(%esp),%eax; \ + jmp sys_##name; \ +ENDPROC(ptregs_##name) + +#define PTREGSCALL1(name) \ +ENTRY(ptregs_##name) ; \ + leal 4(%esp),%edx; \ + movl (PT_EBX+4)(%esp),%eax; \ + jmp sys_##name; \ +ENDPROC(ptregs_##name) + +#define PTREGSCALL2(name) \ +ENTRY(ptregs_##name) ; \ + leal 4(%esp),%ecx; \ + movl (PT_ECX+4)(%esp),%edx; \ + movl (PT_EBX+4)(%esp),%eax; \ + jmp sys_##name; \ +ENDPROC(ptregs_##name) + +#define PTREGSCALL3(name) \ +ENTRY(ptregs_##name) ; \ + CFI_STARTPROC; \ + leal 4(%esp),%eax; \ + pushl_cfi %eax; \ + movl PT_EDX(%eax),%ecx; \ + movl PT_ECX(%eax),%edx; \ + movl PT_EBX(%eax),%eax; \ + call sys_##name; \ + addl $4,%esp; \ + CFI_ADJUST_CFA_OFFSET -4; \ + ret; \ + CFI_ENDPROC; \ +ENDPROC(ptregs_##name) + +PTREGSCALL1(iopl) +PTREGSCALL0(fork) +PTREGSCALL0(vfork) +PTREGSCALL3(execve) +PTREGSCALL2(sigaltstack) +PTREGSCALL0(sigreturn) +PTREGSCALL0(rt_sigreturn) +PTREGSCALL2(vm86) +PTREGSCALL1(vm86old) + +/* Clone is an oddball. The 4th arg is in %edi */ +ENTRY(ptregs_clone) + CFI_STARTPROC + leal 4(%esp),%eax + pushl_cfi %eax + pushl_cfi PT_EDI(%eax) + movl PT_EDX(%eax),%ecx + movl PT_ECX(%eax),%edx + movl PT_EBX(%eax),%eax + call sys_clone + addl $8,%esp + CFI_ADJUST_CFA_OFFSET -8 + ret + CFI_ENDPROC +ENDPROC(ptregs_clone) + +#ifndef CONFIG_XEN +.macro FIXUP_ESPFIX_STACK +/* + * Switch back for ESPFIX stack to the normal zerobased stack + * + * We can't call C functions using the ESPFIX stack. This code reads + * the high word of the segment base from the GDT and swiches to the + * normal stack and adjusts ESP with the matching offset. + */ + /* fixup the stack */ + mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ + mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ + shl $16, %eax + addl %esp, %eax /* the adjusted stack pointer */ + pushl_cfi $__KERNEL_DS + pushl_cfi %eax + lss (%esp), %esp /* switch to the normal stack segment */ + CFI_ADJUST_CFA_OFFSET -8 +.endm +.macro UNWIND_ESPFIX_STACK + movl %ss, %eax + /* see if on espfix stack */ + cmpw $__ESPFIX_SS, %ax + jne 27f + movl $__KERNEL_DS, %eax + movl %eax, %ds + movl %eax, %es + /* switch to normal stack */ + FIXUP_ESPFIX_STACK +27: +.endm + +/* + * Build the entry stubs and pointer table with some assembler magic. + * We pack 7 stubs into a single 32-byte chunk, which will fit in a + * single cache line on all modern x86 implementations. 
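+ *
+ * (Each stub is essentially a two-byte "push $imm8" -- the vector is
+ * encoded so the pushed value stays in signed byte range -- plus a
+ * two-byte short jump, so seven stubs plus the shared jump to
+ * common_interrupt fit within the 32 bytes.)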
+ */ +.section .init.rodata,"a" +ENTRY(interrupt) +.section .entry.text, "ax" + .p2align 5 + .p2align CONFIG_X86_L1_CACHE_SHIFT +ENTRY(irq_entries_start) + RING0_INT_FRAME +vector=FIRST_EXTERNAL_VECTOR +.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7 + .balign 32 + .rept 7 + .if vector < NR_VECTORS + .if vector <> FIRST_EXTERNAL_VECTOR + CFI_ADJUST_CFA_OFFSET -4 + .endif +1: pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */ + .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 + jmp 2f + .endif + .previous + .long 1b + .section .entry.text, "ax" +vector=vector+1 + .endif + .endr +2: jmp common_interrupt +.endr +END(irq_entries_start) + +.previous +END(interrupt) +.previous + +/* + * the CPU automatically disables interrupts when executing an IRQ vector, + * so IRQ-flags tracing has to follow that: + */ + .p2align CONFIG_X86_L1_CACHE_SHIFT +common_interrupt: + addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */ + SAVE_ALL + TRACE_IRQS_OFF + movl %esp,%eax + call do_IRQ + jmp ret_from_intr +ENDPROC(common_interrupt) + CFI_ENDPROC + +/* + * Irq entries should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" +#define BUILD_INTERRUPT3(name, nr, fn) \ +ENTRY(name) \ + RING0_INT_FRAME; \ + pushl_cfi $~(nr); \ + SAVE_ALL; \ + TRACE_IRQS_OFF \ + movl %esp,%eax; \ + call fn; \ + jmp ret_from_intr; \ + CFI_ENDPROC; \ +ENDPROC(name) + +#define BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(name, nr, smp_##name) + +/* The include is where all of the SMP etc. interrupts come from */ +#include + +#else +#define UNWIND_ESPFIX_STACK + + .pushsection .kprobes.text, "ax" + +# A note on the "critical region" in our callback handler. +# We want to avoid stacking callback handlers due to events occurring +# during handling of the last event. To do this, we keep events disabled +# until we've done all processing. HOWEVER, we must enable events before +# popping the stack frame (can't be done atomically) and so it would still +# be possible to get enough handler activations to overflow the stack. +# Although unlikely, bugs of that kind are hard to track down, so we'd +# like to avoid the possibility. +# So, on entry to the handler we detect whether we interrupted an +# existing activation in its critical region -- if so, we pop the current +# activation and restart the handler using the previous one. +# +# The sysexit critical region is slightly different. sysexit +# atomically removes the entire stack frame. If we interrupt in the +# critical region we know that the entire frame is present and correct +# so we can simply throw away the new one. +ENTRY(hypervisor_callback) + RING0_INT_FRAME + pushl_cfi %eax + SAVE_ALL + movl PT_CS(%esp),%ecx + movl PT_EIP(%esp),%eax + andl $SEGMENT_RPL_MASK,%ecx + cmpl $USER_RPL,%ecx + jae .Ldo_upcall + cmpl $scrit,%eax + jb 0f + cmpl $ecrit,%eax + jb critical_region_fixup +0: +#ifdef CONFIG_XEN_SUPERVISOR_MODE_KERNEL + cmpl $sysexit_scrit,%eax + jb .Ldo_upcall + cmpl $sysexit_ecrit,%eax + ja .Ldo_upcall + addl $PT_OLDESP,%esp # Remove eflags...ebx from stack frame. +#endif +.Ldo_upcall: + pushl_cfi %esp + call evtchn_do_upcall + add $4,%esp + CFI_ADJUST_CFA_OFFSET -4 + jmp ret_from_intr + CFI_ENDPROC + +# [How we do the fixup]. We want to merge the current stack frame with the +# just-interrupted frame. How we do this depends on where in the critical +# region the interrupted handler was executing, and so how many saved +# registers are in each frame. We do this quickly using the lookup table +# 'critical_fixup_table'. 
For each byte offset in the critical region, it +# provides the number of bytes which have already been popped from the +# interrupted stack frame. +critical_region_fixup: + movsbl critical_fixup_table-scrit(%eax),%ecx # %ecx contains num slots popped + testl %ecx,%ecx + leal (%esp,%ecx,4),%esi # %esi points at end of src region + leal PT_OLDESP(%esp),%edi # %edi points at end of dst region + jle 17f # skip loop if nothing to copy +16: subl $4,%esi # pre-decrementing copy loop + subl $4,%edi + movl (%esi),%eax + movl %eax,(%edi) + loop 16b +17: movl %edi,%esp # final %edi is top of merged stack + jmp .Ldo_upcall + +.section .rodata,"a" +critical_fixup_table: + .rept __SIZEOF_TEST_PENDING + .byte -1 + .endr + .byte -1,-1 # jnz 14f + .byte 0 # pop %ebx + .byte 1 # pop %ecx + .byte 2 # pop %edx + .byte 3 # pop %esi + .byte 4 # pop %edi + .byte 5 # pop %ebp + .byte 6 # pop %eax + .byte 7 # pop %ds + .byte 8 # pop %es + .byte 9,9 # pop %fs +#ifndef CONFIG_X86_32_LAZY_GS + .byte 10,10 # pop %gs + .byte 11,11,11 # add $4,%esp +#else + .byte 10,10,10 # add $8,%esp +#endif + .byte 12 # iret + .rept __SIZEOF_DISABLE_INTERRUPTS + .byte -1 + .endr +.previous + +# Hypervisor uses this for application faults while it executes. +# We get here for two reasons: +# 1. Fault while reloading DS, ES, FS or GS +# 2. Fault while executing IRET +# Category 1 we fix up by reattempting the load, and zeroing the segment +# register if the load fails. +# Category 2 we fix up by jumping to do_iret_error. We cannot use the +# normal Linux return path in this case because if we use the IRET hypercall +# to pop the stack frame we end up in an infinite loop of failsafe callbacks. +# We distinguish between categories by maintaining a status value in EAX. +ENTRY(failsafe_callback) + pushl %eax + movl $1,%eax +1: mov 4(%esp),%ds +2: mov 8(%esp),%es +3: mov 12(%esp),%fs +4: mov 16(%esp),%gs + testl %eax,%eax + popl %eax + jz 5f + addl $16,%esp # EAX != 0 => Category 2 (Bad IRET) + jmp iret_exc +5: addl $16,%esp # EAX == 0 => Category 1 (Bad segment) + RING0_INT_FRAME + pushl $0 + SAVE_ALL + jmp ret_from_exception +.section .fixup,"ax"; \ +6: xorl %eax,%eax; \ + movl %eax,4(%esp); \ + jmp 1b; \ +7: xorl %eax,%eax; \ + movl %eax,8(%esp); \ + jmp 2b; \ +8: xorl %eax,%eax; \ + movl %eax,12(%esp); \ + jmp 3b; \ +9: xorl %eax,%eax; \ + movl %eax,16(%esp); \ + jmp 4b; \ +.previous; \ +.section __ex_table,"a"; \ + .align 4; \ + .long 1b,6b; \ + .long 2b,7b; \ + .long 3b,8b; \ + .long 4b,9b; \ +.previous +#endif + CFI_ENDPROC + +ENTRY(coprocessor_error) + RING0_INT_FRAME + pushl_cfi $0 + pushl_cfi $do_coprocessor_error + jmp error_code + CFI_ENDPROC +END(coprocessor_error) + +ENTRY(simd_coprocessor_error) + RING0_INT_FRAME + pushl_cfi $0 +#ifdef CONFIG_X86_INVD_BUG + /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ +661: pushl_cfi $do_general_protection +662: +.section .altinstructions,"a" + altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f +.previous +.section .altinstr_replacement,"ax" +663: pushl $do_simd_coprocessor_error +664: +.previous +#else + pushl_cfi $do_simd_coprocessor_error +#endif + jmp error_code + CFI_ENDPROC +END(simd_coprocessor_error) + +ENTRY(device_not_available) + RING0_INT_FRAME + pushl_cfi $-1 # mark this as an int + pushl_cfi $do_device_not_available + jmp error_code + CFI_ENDPROC +END(device_not_available) + +#ifdef CONFIG_PARAVIRT +ENTRY(native_iret) + iret +.section __ex_table,"a" + .align 4 + .long native_iret, iret_exc +.previous +END(native_iret) + 
+ENTRY(native_irq_enable_sysexit) + sti + sysexit +END(native_irq_enable_sysexit) +#endif + +ENTRY(overflow) + RING0_INT_FRAME + pushl_cfi $0 + pushl_cfi $do_overflow + jmp error_code + CFI_ENDPROC +END(overflow) + +ENTRY(bounds) + RING0_INT_FRAME + pushl_cfi $0 + pushl_cfi $do_bounds + jmp error_code + CFI_ENDPROC +END(bounds) + +ENTRY(invalid_op) + RING0_INT_FRAME + pushl_cfi $0 + pushl_cfi $do_invalid_op + jmp error_code + CFI_ENDPROC +END(invalid_op) + +ENTRY(coprocessor_segment_overrun) + RING0_INT_FRAME + pushl_cfi $0 + pushl_cfi $do_coprocessor_segment_overrun + jmp error_code + CFI_ENDPROC +END(coprocessor_segment_overrun) + +ENTRY(invalid_TSS) + RING0_EC_FRAME + pushl_cfi $do_invalid_TSS + jmp error_code + CFI_ENDPROC +END(invalid_TSS) + +ENTRY(segment_not_present) + RING0_EC_FRAME + pushl_cfi $do_segment_not_present + jmp error_code + CFI_ENDPROC +END(segment_not_present) + +ENTRY(stack_segment) + RING0_EC_FRAME + pushl_cfi $do_stack_segment + jmp error_code + CFI_ENDPROC +END(stack_segment) + +ENTRY(alignment_check) + RING0_EC_FRAME + pushl_cfi $do_alignment_check + jmp error_code + CFI_ENDPROC +END(alignment_check) + +ENTRY(divide_error) + RING0_INT_FRAME + pushl_cfi $0 # no error code + pushl_cfi $do_divide_error + jmp error_code + CFI_ENDPROC +END(divide_error) + +#ifdef CONFIG_X86_MCE +ENTRY(machine_check) + RING0_INT_FRAME + pushl_cfi $0 + pushl_cfi machine_check_vector + jmp error_code + CFI_ENDPROC +END(machine_check) +#endif + +#ifndef CONFIG_XEN +ENTRY(spurious_interrupt_bug) + RING0_INT_FRAME + pushl_cfi $0 + pushl_cfi $do_spurious_interrupt_bug + jmp error_code + CFI_ENDPROC +END(spurious_interrupt_bug) +#endif /* !CONFIG_XEN */ + +ENTRY(fixup_4gb_segment) + RING0_EC_FRAME + pushl_cfi $do_fixup_4gb_segment + jmp error_code + CFI_ENDPROC +END(fixup_4gb_segment) +/* + * End of kprobes section + */ + .popsection + +#ifdef CONFIG_STACK_UNWIND +ENTRY(arch_unwind_init_running) + CFI_STARTPROC + movl 4(%esp), %edx + movl (%esp), %ecx + leal 4(%esp), %eax + movl %ebx, PT_EBX(%edx) + xorl %ebx, %ebx + movl %ebx, PT_ECX(%edx) + movl %ebx, PT_EDX(%edx) + movl %esi, PT_ESI(%edx) + movl %edi, PT_EDI(%edx) + movl %ebp, PT_EBP(%edx) + movl %ebx, PT_EAX(%edx) + movl $__USER_DS, PT_DS(%edx) + movl $__USER_DS, PT_ES(%edx) + movl $__KERNEL_PERCPU, PT_FS(%edx) + movl $__KERNEL_STACK_CANARY, PT_GS(%edx) + movl %eax, PT_OLDESP(%edx) + movl 16(%esp), %eax + movl %ebx, PT_ORIG_EAX(%edx) + movl %ecx, PT_EIP(%edx) + movl 12(%esp), %ecx + movl $__KERNEL_CS, PT_CS(%edx) + movl %eax, 12(%esp) + movl 8(%esp), %eax + movl %ecx, 8(%esp) + movl %ebx, PT_EFLAGS(%edx) + movl PT_EBX(%edx), %ebx + movl $__KERNEL_DS, PT_OLDSS(%edx) + jmpl *%eax + CFI_ENDPROC +ENDPROC(arch_unwind_init_running) +#endif + +ENTRY(kernel_thread_helper) + pushl $0 # fake return address for unwinder + CFI_STARTPROC + movl %edi,%eax + call *%esi + call do_exit + ud2 # padding for call trace + CFI_ENDPROC +ENDPROC(kernel_thread_helper) + +#ifdef CONFIG_FUNCTION_TRACER +#ifdef CONFIG_DYNAMIC_FTRACE + +ENTRY(mcount) + ret +END(mcount) + +ENTRY(ftrace_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + subl $MCOUNT_INSN_SIZE, %eax + +.globl ftrace_call +ftrace_call: + call ftrace_stub + + popl %edx + popl %ecx + popl %eax +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + jmp ftrace_stub +#endif + +.globl ftrace_stub +ftrace_stub: + ret +END(ftrace_caller) + +#else /* ! 
CONFIG_DYNAMIC_FTRACE */ + +ENTRY(mcount) + cmpl $0, function_trace_stop + jne ftrace_stub + + cmpl $ftrace_stub, ftrace_trace_function + jnz trace +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpl $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller + + cmpl $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller +#endif +.globl ftrace_stub +ftrace_stub: + ret + + /* taken from glibc */ +trace: + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + subl $MCOUNT_INSN_SIZE, %eax + + call *ftrace_trace_function + + popl %edx + popl %ecx + popl %eax + jmp ftrace_stub +END(mcount) +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %edx + lea 0x4(%ebp), %eax + movl (%ebp), %ecx + subl $MCOUNT_INSN_SIZE, %edx + call prepare_ftrace_return + popl %edx + popl %ecx + popl %eax + ret +END(ftrace_graph_caller) + +.globl return_to_handler +return_to_handler: + pushl %eax + pushl %edx + movl %ebp, %eax + call ftrace_return_to_handler + movl %eax, %ecx + popl %edx + popl %eax + jmp *%ecx +#endif + +#ifdef TIF_CSTAR + # pv syscall call handler stub +ENTRY(ia32pv_cstar_target) + RING0_INT_FRAME + movl $__USER_DS,16(%esp) + movl %ebp,%ecx + movl $__USER_CS,4(%esp) + movl 12(%esp),%ebp + pushl_cfi %eax # save orig_eax +/* + * Load the potential sixth argument from user stack. + * Careful about security. + */ + cmpl $__PAGE_OFFSET-4,%ebp + CFI_REMEMBER_STATE + ja cstar_fault +1: movl (%ebp),%ebp +.section __ex_table,"a" + .align 4 + .long 1b,cstar_fault +.previous + SAVE_ALL + GET_THREAD_INFO(%ebp) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) + jnz cstar_trace_entry + cmpl $NR_syscalls,%eax + jae cstar_badsys +.Lcstar_call: + btl %eax,cstar_special + jc .Lcstar_special + call *cstar_call_table(,%eax,4) + movl %eax,PT_EAX(%esp) # store the return value +.Lcstar_exit: + movl PT_ECX(%esp),%ecx + movl %ecx,PT_EBP(%esp) # put user EBP back in place + jmp syscall_exit +.Lcstar_special: + movl PT_ECX(%esp),%ecx + movl %ecx,PT_EBP(%esp) # put user EBP back in place + jmp syscall_call +GLOBAL(cstar_set_tif) + movl $cstar_clear_tif,(%esp) # replace return address + LOCK_PREFIX + orl $_TIF_CSTAR,TI_flags(%ebp) + jmp *sys_call_table(,%eax,4) +cstar_clear_tif: + movl %eax,PT_EAX(%esp) # store the return value + LOCK_PREFIX + andl $~_TIF_CSTAR,TI_flags(%ebp) + jmp .Lcstar_exit +cstar_trace_entry: + movl $-ENOSYS,PT_EAX(%esp) + cmpl $NR_syscalls,%eax + jae 1f + btl %eax,cstar_special + jc .Lcstar_trace_special +1: movl %esp,%eax + LOCK_PREFIX + orl $_TIF_CSTAR,TI_flags(%ebp) + call syscall_trace_enter + LOCK_PREFIX + andl $~_TIF_CSTAR,TI_flags(%ebp) + /* What it returned is what we'll actually use. */ + cmpl $NR_syscalls,%eax + jb .Lcstar_call + jmp .Lcstar_exit +.Lcstar_trace_special: + movl PT_ECX(%esp),%ecx + movl %esp,%eax + movl %ecx,PT_EBP(%esp) # put user EBP back in place + call syscall_trace_enter + /* What it returned is what we'll actually use. 
*/ + cmpl $NR_syscalls,%eax + jb syscall_call + jmp syscall_exit +cstar_badsys: + movl $-ENOSYS,PT_EAX(%esp) +.Lcstar_resume: + movl PT_ECX(%esp),%ecx + movl %ecx,PT_EBP(%esp) # put user EBP back in place + jmp resume_userspace + CFI_RESTORE_STATE +cstar_fault: + movl $-EFAULT,%eax + SAVE_ALL + GET_THREAD_INFO(%ebp) + jmp .Lcstar_resume + CFI_ENDPROC +ENDPROC(ia32pv_cstar_target) + +ENTRY(cstar_ret_from_fork) + CFI_STARTPROC + movl PT_ECX(%esp),%ecx + GET_THREAD_INFO(%ebp) + movl %ecx,PT_EBP(%esp) # put user EBP back in place + LOCK_PREFIX + andl $~_TIF_CSTAR,TI_flags(%ebp) + jmp ret_from_fork + CFI_ENDPROC +END(cstar_ret_from_fork) + +#include +.pushsection .rodata,"a" +.balign 4 +cstar_special: +nr=0 +mask=0 +.rept NR_syscalls+31 + .irp n, __NR_sigreturn, __NR_rt_sigreturn + .if nr == \n + mask = mask | (1 << (\n & 31)) + .endif + .endr + nr = nr + 1 + .if (nr & 31) == 0 + .long mask + mask = 0 + .endif +.endr +.popsection +#endif /* TIF_CSTAR */ + +/* + * Some functions should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" + +ENTRY(page_fault) + RING0_EC_FRAME + pushl_cfi $do_page_fault + ALIGN +error_code: + /* the function address is in %gs's slot on the stack */ + pushl_cfi %fs + /*CFI_REL_OFFSET fs, 0*/ + pushl_cfi %es + /*CFI_REL_OFFSET es, 0*/ + pushl_cfi %ds + /*CFI_REL_OFFSET ds, 0*/ + pushl_cfi %eax + CFI_REL_OFFSET eax, 0 + pushl_cfi %ebp + CFI_REL_OFFSET ebp, 0 + pushl_cfi %edi + CFI_REL_OFFSET edi, 0 + pushl_cfi %esi + CFI_REL_OFFSET esi, 0 + pushl_cfi %edx + CFI_REL_OFFSET edx, 0 + pushl_cfi %ecx + CFI_REL_OFFSET ecx, 0 + pushl_cfi %ebx + CFI_REL_OFFSET ebx, 0 + cld + movl $(__KERNEL_PERCPU), %ecx + movl %ecx, %fs + UNWIND_ESPFIX_STACK + GS_TO_REG %ecx + movl PT_GS(%esp), %edi # get the function address + movl PT_ORIG_EAX(%esp), %edx # get the error code + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart + REG_TO_PTGS %ecx + SET_KERNEL_GS %ecx + movl $(__USER_DS), %ecx + movl %ecx, %ds + movl %ecx, %es + TRACE_IRQS_OFF + movl %esp,%eax # pt_regs pointer + call *%edi + jmp ret_from_exception + CFI_ENDPROC +END(page_fault) + +#ifndef CONFIG_XEN +/* + * Debug traps and NMI can happen at the one SYSENTER instruction + * that sets up the real kernel stack. Check here, since we can't + * allow the wrong stack to be used. + * + * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have + * already pushed 3 words if it hits on the sysenter instruction: + * eflags, cs and eip. + * + * We just load the right stack, and push the three (known) values + * by hand onto the new stack - while updating the return eip past + * the instruction that would have done it for sysenter. + */ +.macro FIX_STACK offset ok label + cmpw $__KERNEL_CS, 4(%esp) + jne \ok +\label: + movl TSS_sysenter_sp0 + \offset(%esp), %esp + CFI_DEF_CFA esp, 0 + CFI_UNDEFINED eip + pushfl_cfi + pushl_cfi $__KERNEL_CS + pushl_cfi $sysenter_past_esp + CFI_REL_OFFSET eip, 0 +.endm +#endif /* CONFIG_XEN */ + +ENTRY(debug) + RING0_INT_FRAME +#ifndef CONFIG_XEN + cmpl $ia32_sysenter_target,(%esp) + jne debug_stack_correct + FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn +debug_stack_correct: +#endif /* !CONFIG_XEN */ + pushl_cfi $-1 # mark this as an int + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx,%edx # error code 0 + movl %esp,%eax # pt_regs pointer + call do_debug + jmp ret_from_exception + CFI_ENDPROC +END(debug) + +/* + * NMI is doubly nasty. It can happen _while_ we're handling + * a debug fault, and the debug fault hasn't yet been able to + * clear up the stack. 
So we first check whether we got an + * NMI on the sysenter entry path, but after that we need to + * check whether we got an NMI on the debug path where the debug + * fault happened on the sysenter path. + */ +ENTRY(nmi) + RING0_INT_FRAME + pushl_cfi %eax +#ifndef CONFIG_XEN + movl %ss, %eax + cmpw $__ESPFIX_SS, %ax + popl_cfi %eax + je nmi_espfix_stack + cmpl $ia32_sysenter_target,(%esp) + je nmi_stack_fixup + pushl_cfi %eax + movl %esp,%eax + /* Do not access memory above the end of our stack page, + * it might not exist. + */ + andl $(THREAD_SIZE-1),%eax + cmpl $(THREAD_SIZE-20),%eax + popl_cfi %eax + jae nmi_stack_correct + cmpl $ia32_sysenter_target,12(%esp) + je nmi_debug_stack_check +nmi_stack_correct: + /* We have a RING0_INT_FRAME here */ + pushl_cfi %eax + SAVE_ALL + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_nmi + jmp restore_all_notrace + CFI_ENDPROC + +nmi_stack_fixup: + RING0_INT_FRAME + FIX_STACK 12, nmi_stack_correct, 1 + jmp nmi_stack_correct + +nmi_debug_stack_check: + /* We have a RING0_INT_FRAME here */ + cmpw $__KERNEL_CS,16(%esp) + jne nmi_stack_correct + cmpl $debug,(%esp) + jb nmi_stack_correct + cmpl $debug_esp_fix_insn,(%esp) + ja nmi_stack_correct + FIX_STACK 24, nmi_stack_correct, 1 + jmp nmi_stack_correct + +nmi_espfix_stack: + /* We have a RING0_INT_FRAME here. + * + * create the pointer to lss back + */ + pushl_cfi %ss + pushl_cfi %esp + addl $4, (%esp) + /* copy the iret frame of 12 bytes */ + .rept 3 + pushl_cfi 16(%esp) + .endr + pushl_cfi %eax + SAVE_ALL + FIXUP_ESPFIX_STACK # %eax == %esp + xorl %edx,%edx # zero error code + call do_nmi + RESTORE_REGS + lss 12+4(%esp), %esp # back to espfix stack + CFI_ADJUST_CFA_OFFSET -24 + jmp irq_return +#else + SAVE_ALL + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_nmi + orl $NMI_MASK, PT_EFLAGS(%esp) + jmp restore_all +#endif + CFI_ENDPROC +END(nmi) + +ENTRY(int3) + RING0_INT_FRAME + pushl_cfi $-1 # mark this as an int + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_int3 + jmp ret_from_exception + CFI_ENDPROC +END(int3) + +ENTRY(general_protection) + RING0_EC_FRAME + pushl_cfi $do_general_protection + jmp error_code + CFI_ENDPROC +END(general_protection) + +#ifdef CONFIG_KVM_GUEST +ENTRY(async_page_fault) + RING0_EC_FRAME + pushl_cfi $do_async_page_fault + jmp error_code + CFI_ENDPROC +END(async_page_fault) +#endif + +/* + * End of kprobes section + */ + .popsection diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 4441d47..b0d761c 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -377,7 +377,7 @@ ENTRY(ia32_sysenter_target) CFI_SIGNAL_FRAME CFI_DEF_CFA esp, 0 CFI_REGISTER esp, ebp - movl TSS_sysenter_sp0(%esp),%esp + movl SYSENTER_stack_sp0(%esp),%esp sysenter_past_esp: /* * Interrupts are disabled here, but we can't trace it until @@ -1045,7 +1045,7 @@ ENTRY(kernel_thread_helper) CFI_ENDPROC ENDPROC(kernel_thread_helper) -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN /* Xen doesn't set %esp to be precisely what the normal sysenter entrypoint expects, so fix it up before using the normal path. 
*/ ENTRY(xen_sysenter_target) @@ -1137,7 +1137,7 @@ ENDPROC(xen_failsafe_callback) BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK, xen_evtchn_do_upcall) -#endif /* CONFIG_XEN */ +#endif /* CONFIG_PARAVIRT_XEN */ #ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE @@ -1300,7 +1300,7 @@ END(page_fault) * that sets up the real kernel stack. Check here, since we can't * allow the wrong stack to be used. * - * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have + * "SYSENTER_stack_sp0+12" is because the NMI/debug handler will have * already pushed 3 words if it hits on the sysenter instruction: * eflags, cs and eip. * @@ -1312,7 +1312,7 @@ END(page_fault) cmpw $__KERNEL_CS, 4(%esp) jne \ok \label: - movl TSS_sysenter_sp0 + \offset(%esp), %esp + movl SYSENTER_stack_sp0 + \offset(%esp), %esp CFI_DEF_CFA esp, 0 CFI_UNDEFINED eip pushfl_cfi diff --git a/arch/x86/kernel/entry_64-xen.S b/arch/x86/kernel/entry_64-xen.S new file mode 100644 index 0000000..0f08b7a --- /dev/null +++ b/arch/x86/kernel/entry_64-xen.S @@ -0,0 +1,1385 @@ +/* + * linux/arch/x86_64/entry.S + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs + * Copyright (C) 2000 Pavel Machek + * Jun Nakajima + * Asit Mallick + * Modified for Xen + */ + +/* + * entry.S contains the system-call and fault low-level handling routines. + * + * Some of this is documented in Documentation/x86/entry_64.txt + * + * NOTE: This code handles signal-recognition, which happens every time + * after an interrupt and after each system call. + * + * Normal syscalls and interrupts don't save a full stack frame, this is + * only done for syscall tracing, signals or fork/exec et.al. + * + * A note on terminology: + * - top of stack: Architecture defined interrupt frame from SS to RIP + * at the top of the kernel process stack. + * - partial stack frame: partially saved registers up to R11. + * - full stack frame: Like partial stack frame, but all register saved. + * + * Some macro usage: + * - CFI macros are used to generate dwarf2 unwind information for better + * backtraces. They don't change any code. + * - SAVE_ALL/RESTORE_ALL - Save/restore all registers + * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify. + * There are unfortunately lots of special cases where some registers + * not touched. The macro is a big mess that should be cleaned up. + * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS. + * Gives a full stack frame. + * - ENTRY/END Define functions in the symbol table. + * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack + * frame that is otherwise undefined after a SYSCALL + * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. + * - errorentry/paranoidentry/zeroentry - Define exception entry points. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Avoid __ASSEMBLER__'ifying just for this. 
*/ +#include +#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_64BIT 0x80000000 +#define __AUDIT_ARCH_LE 0x40000000 + + .code64 + .section .entry.text, "ax" + +#ifdef CONFIG_FUNCTION_TRACER +#ifdef CONFIG_DYNAMIC_FTRACE +ENTRY(mcount) + retq +END(mcount) + +ENTRY(ftrace_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + + MCOUNT_SAVE_FRAME + + movq 0x38(%rsp), %rdi + movq 8(%rbp), %rsi + subq $MCOUNT_INSN_SIZE, %rdi + +GLOBAL(ftrace_call) + call ftrace_stub + + MCOUNT_RESTORE_FRAME + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +GLOBAL(ftrace_graph_call) + jmp ftrace_stub +#endif + +GLOBAL(ftrace_stub) + retq +END(ftrace_caller) + +#else /* ! CONFIG_DYNAMIC_FTRACE */ +ENTRY(mcount) + cmpl $0, function_trace_stop + jne ftrace_stub + + cmpq $ftrace_stub, ftrace_trace_function + jnz trace + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpq $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller + + cmpq $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller +#endif + +GLOBAL(ftrace_stub) + retq + +trace: + MCOUNT_SAVE_FRAME + + movq 0x38(%rsp), %rdi + movq 8(%rbp), %rsi + subq $MCOUNT_INSN_SIZE, %rdi + + call *ftrace_trace_function + + MCOUNT_RESTORE_FRAME + + jmp ftrace_stub +END(mcount) +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + + MCOUNT_SAVE_FRAME + + leaq 8(%rbp), %rdi + movq 0x38(%rsp), %rsi + movq (%rbp), %rdx + subq $MCOUNT_INSN_SIZE, %rsi + + call prepare_ftrace_return + + MCOUNT_RESTORE_FRAME + + retq +END(ftrace_graph_caller) + +GLOBAL(return_to_handler) + subq $24, %rsp + + /* Save the return values */ + movq %rax, (%rsp) + movq %rdx, 8(%rsp) + movq %rbp, %rdi + + call ftrace_return_to_handler + + movq %rax, %rdi + movq 8(%rsp), %rdx + movq (%rsp), %rax + addq $24, %rsp + jmp *%rdi +#endif + + +#ifndef CONFIG_PREEMPT +#define retint_kernel retint_restore_args +#endif + + +.macro TRACE_IRQS_IRETQ offset=ARGOFFSET +#ifdef CONFIG_TRACE_IRQFLAGS + bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ + jnc 1f + TRACE_IRQS_ON +1: +#endif +.endm + +NMI_MASK = 0x80000000 + +/* + * C code is not supposed to know about undefined top of stack. Every time + * a C function with an pt_regs argument is called from the SYSCALL based + * fast path FIXUP_TOP_OF_STACK is needed. + * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs + * manipulation. 
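+ *
+ * In this Xen flavour the guest kernel is entered with a complete iret
+ * frame already on the stack, so FIXUP_TOP_OF_STACK below only has to
+ * fake up CS and RCX, and RESTORE_TOP_OF_STACK has nothing to do.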
+ */ + + /* %rsp:at FRAMEEND */ + .macro FIXUP_TOP_OF_STACK tmp offset=0 + movq $__USER_CS,CS+\offset(%rsp) + movq $-1,RCX+\offset(%rsp) + .endm + + .macro RESTORE_TOP_OF_STACK tmp offset=0 + .endm + + .macro FAKE_STACK_FRAME child_rip + /* push in order ss, rsp, eflags, cs, rip */ + xorl %eax, %eax + pushq_cfi $__KERNEL_DS /* ss */ + /*CFI_REL_OFFSET ss,0*/ + pushq_cfi %rax /* rsp */ + CFI_REL_OFFSET rsp,0 + pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_BIT1) /* eflags - interrupts on */ + /*CFI_REL_OFFSET rflags,0*/ + pushq_cfi $__KERNEL_CS /* cs */ + /*CFI_REL_OFFSET cs,0*/ + pushq_cfi \child_rip /* rip */ + CFI_REL_OFFSET rip,0 + pushq_cfi %rax /* orig rax */ + .endm + + .macro UNFAKE_STACK_FRAME + addq $8*6, %rsp + CFI_ADJUST_CFA_OFFSET -(6*8) + .endm + +/* + * initial frame state for syscall + */ + .macro BASIC_FRAME start=1 offset=0 + .if \start + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA rsp, SS+8+\offset-RIP + .else + CFI_DEF_CFA_OFFSET SS+8+\offset-RIP + .endif + /*CFI_REL_OFFSET ss, SS+\offset-RIP*/ + CFI_REL_OFFSET rsp, RSP+\offset-RIP + /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/ + /*CFI_REL_OFFSET cs, CS+\offset-RIP*/ + CFI_REL_OFFSET rip, RIP+\offset-RIP + .endm + +/* + * initial frame state for interrupts (and exceptions without error code) + */ + .macro INTR_FRAME start=1 offset=0 + .if \start == 1 + BASIC_FRAME 1, \offset+2*8 + CFI_REL_OFFSET rcx, 0+\offset + CFI_REL_OFFSET r11, 8+\offset + .else + BASIC_FRAME \start, \offset + .endif + .endm + +/* + * initial frame state for exceptions with error code (and interrupts + * with vector already pushed) + */ + .macro XCPT_FRAME start=1 offset=0 + INTR_FRAME \start, RIP+\offset-ORIG_RAX + .endm + +/* + * frame that enables calling into C. + */ + .macro PARTIAL_FRAME start=1 offset=0 + .if \start >= 0 + XCPT_FRAME 2*\start, ORIG_RAX+\offset-ARGOFFSET + .endif + CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET + CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET + CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET + CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET + CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET + CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET + CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET + CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET + CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET + .endm + +/* + * frame that enables passing a complete pt_regs to a C function. + */ + .macro DEFAULT_FRAME start=1 offset=0 + .if \start >= -1 + PARTIAL_FRAME \start, R11+\offset-R15 + .endif + CFI_REL_OFFSET rbx, RBX+\offset + CFI_REL_OFFSET rbp, RBP+\offset + CFI_REL_OFFSET r12, R12+\offset + CFI_REL_OFFSET r13, R13+\offset + CFI_REL_OFFSET r14, R14+\offset + CFI_REL_OFFSET r15, R15+\offset + .endm + + /* + * Must be consistent with the definition in arch-x86/xen-x86_64.h: + * struct iret_context { + * u64 rax, r11, rcx, flags, rip, cs, rflags, rsp, ss; + * }; + * with rax, r11, and rcx being taken care of in the hypercall stub. + */ + .macro HYPERVISOR_IRET flag + .if \flag == 0 # return from syscall always uses the hypercall + testb $3,1*8(%rsp) + jnz 2f + testl $NMI_MASK,2*8(%rsp) + jnz 2f + + cmpb $0,(xen_features+XENFEAT_supervisor_mode_kernel)(%rip) + jne 1f + + /* Direct iret to kernel space. Correct CS and SS. */ + orl $3,1*8(%rsp) + orl $3,4*8(%rsp) +1: iretq + .endif + +2: /* Slow iret via hypervisor. 
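+ Each hypercall stub occupies 32 bytes of the shared
+ hypercall page, hence the "__HYPERVISOR_iret * 32" offset in the jmp
+ below.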
*/ + andl $~NMI_MASK, 2*8(%rsp) + pushq $\flag & VGCF_in_syscall + jmp hypercall_page + (__HYPERVISOR_iret * 32) + .endm + +#ifndef CONFIG_XEN +/* save partial stack frame */ + .macro SAVE_ARGS_IRQ + cld + /* start from rbp in pt_regs and jump over */ + movq_cfi rdi, RDI-RBP + movq_cfi rsi, RSI-RBP + movq_cfi rdx, RDX-RBP + movq_cfi rcx, RCX-RBP + movq_cfi rax, RAX-RBP + movq_cfi r8, R8-RBP + movq_cfi r9, R9-RBP + movq_cfi r10, R10-RBP + movq_cfi r11, R11-RBP + + /* Save rbp so that we can unwind from get_irq_regs() */ + movq_cfi rbp, 0 + + /* Save previous stack value */ + movq %rsp, %rsi + + leaq -RBP(%rsp),%rdi /* arg1 for handler */ + testl $3, CS(%rdi) + je 1f + SWAPGS + /* + * irq_count is used to check if a CPU is already on an interrupt stack + * or not. While this is essentially redundant with preempt_count it is + * a little cheaper to use a separate counter in the PDA (short of + * moving irq_enter into assembly, which would be too much work) + */ +1: incl PER_CPU_VAR(irq_count) + jne 2f + mov PER_CPU_VAR(irq_stack_ptr),%rsp + CFI_DEF_CFA_REGISTER rsi + +2: /* Store previous stack value */ + pushq %rsi + CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ + 0x77 /* DW_OP_breg7 */, 0, \ + 0x06 /* DW_OP_deref */, \ + 0x08 /* DW_OP_const1u */, SS+8-RBP, \ + 0x22 /* DW_OP_plus */ + /* We entered an interrupt context - irqs are off: */ + TRACE_IRQS_OFF + .endm +#endif + +ENTRY(save_rest) + CFI_STARTPROC + movq 5*8+16(%rsp), %r11 /* save return address */ + movq %rbx, RBX+16(%rsp) + movq %rbp, RBP+16(%rsp) + movq %r12, R12+16(%rsp) + movq %r13, R13+16(%rsp) + movq %r14, R14+16(%rsp) + movq %r15, R15+16(%rsp) + movq %r11, 8(%rsp) /* return address */ + FIXUP_TOP_OF_STACK %r11, 16 + ret + CFI_ENDPROC +END(save_rest) + +#ifndef CONFIG_XEN +/* save complete stack frame */ + .pushsection .kprobes.text, "ax" +ENTRY(save_paranoid) + XCPT_FRAME offset=ORIG_RAX-R15+8 + cld + movq %rdi, RDI+8(%rsp) + movq %rsi, RSI+8(%rsp) + movq_cfi rdx, RDX+8 + movq_cfi rcx, RCX+8 + movq_cfi rax, RAX+8 + movq %r8, R8+8(%rsp) + movq %r9, R9+8(%rsp) + movq %r10, R10+8(%rsp) + movq %r11, R11+8(%rsp) + movq_cfi rbx, RBX+8 + movq %rbp, RBP+8(%rsp) + movq %r12, R12+8(%rsp) + movq %r13, R13+8(%rsp) + movq %r14, R14+8(%rsp) + movq %r15, R15+8(%rsp) + movl $1,%ebx + movl $MSR_GS_BASE,%ecx + rdmsr + testl %edx,%edx + js 1f /* negative -> in kernel */ + SWAPGS + xorl %ebx,%ebx +1: ret + CFI_ENDPROC +END(save_paranoid) + .popsection +#endif + +/* + * A newly forked process directly context switches into this address. + * + * rdi: prev task we switched from + */ +ENTRY(ret_from_fork) + DEFAULT_FRAME + + LOCK ; btr $TIF_FORK,TI_flags(%r8) + + pushq_cfi kernel_eflags(%rip) + popfq_cfi # reset kernel eflags + + call schedule_tail # rdi: 'prev' task parameter + + GET_THREAD_INFO(%rcx) + + RESTORE_REST + + testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? + jnz 1f + /* Need to set the proper %ss (not NULL) for ring 3 iretq */ + movl $__KERNEL_DS,SS-ARGOFFSET(%rsp) + jmp retint_restore_args +1: + testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET + jnz int_ret_from_sys_call + + RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET + jmp ret_from_sys_call # go to the SYSRET fastpath + + CFI_ENDPROC +END(ret_from_fork) + +/* + * System call entry. Up to 6 arguments in registers are supported. + * + * SYSCALL does not save anything on the stack and does not change the + * stack pointer. 
+ */ + +/* + * Register setup: + * rax system call number + * rdi arg0 + * rcx return address for syscall/sysret, C arg3 + * rsi arg1 + * rdx arg2 + * r10 arg3 (--> moved to rcx for C) + * r8 arg4 + * r9 arg5 + * r11 eflags for syscall/sysret, temporary for C + * r12-r15,rbp,rbx saved by C code, not touched. + * + * Interrupts are enabled on entry. + * Only called from user space. + * + * XXX if we had a free scratch register we could save the RSP into the stack frame + * and report it properly in ps. Unfortunately we haven't. + * + * When user can change the frames always force IRET. That is because + * it deals with uncanonical addresses better. SYSRET has trouble + * with them due to bugs in both AMD and Intel CPUs. + */ + +ENTRY(system_call) + INTR_FRAME start=2 offset=2*8 + SAVE_ARGS -8,0 + movq %rax,ORIG_RAX-ARGOFFSET(%rsp) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + jnz tracesys +system_call_fastpath: + cmpq $__NR_syscall_max,%rax + ja badsys + movq %r10,%rcx + call *sys_call_table(,%rax,8) # XXX: rip relative + movq %rax,RAX-ARGOFFSET(%rsp) +/* + * Syscall return path ending with SYSRET (fast path) + * Has incomplete stack frame and undefined top of stack. + */ +ret_from_sys_call: + movl $_TIF_ALLWORK_MASK,%edi + /* edi: flagmask */ +sysret_check: + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx + andl %edi,%edx + jnz sysret_careful + CFI_REMEMBER_STATE + /* + * sysretq will re-enable interrupts: + */ + TRACE_IRQS_ON + RESTORE_ARGS 1,8,0,0 + xor %ecx,%ecx + xor %r11,%r11 + HYPERVISOR_IRET VGCF_IN_SYSCALL + + CFI_RESTORE_STATE + /* Handle reschedules */ + /* edx: work, edi: workmask */ +sysret_careful: + bt $TIF_NEED_RESCHED,%edx + jnc sysret_signal + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + pushq_cfi %rdi + call schedule + popq_cfi %rdi + jmp sysret_check + + /* Handle a signal */ +sysret_signal: + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) +#ifdef CONFIG_AUDITSYSCALL + bt $TIF_SYSCALL_AUDIT,%edx + jc sysret_audit +#endif + /* + * We have a signal, or exit tracing or single-step. + * These all wind up with the iret return path anyway, + * so just join that path right now. + */ + FIXUP_TOP_OF_STACK %r11, -ARGOFFSET + jmp int_check_syscall_exit_work + +badsys: + movq $-ENOSYS,RAX-ARGOFFSET(%rsp) + jmp ret_from_sys_call + +#ifdef CONFIG_AUDITSYSCALL + /* + * Fast path for syscall audit without full syscall trace. + * We just call __audit_syscall_entry() directly, and then + * jump back to the normal fast path. + */ +auditsys: + movq %r10,%r9 /* 6th arg: 4th syscall arg */ + movq %rdx,%r8 /* 5th arg: 3rd syscall arg */ + movq %rsi,%rcx /* 4th arg: 2nd syscall arg */ + movq %rdi,%rdx /* 3rd arg: 1st syscall arg */ + movq %rax,%rsi /* 2nd arg: syscall number */ + movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */ + call __audit_syscall_entry + LOAD_ARGS 0 /* reload call-clobbered registers */ + jmp system_call_fastpath + + /* + * Return fast path for syscall audit. Call __audit_syscall_exit() + * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT + * masked off. + */ +sysret_audit: + movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */ + cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? 
*/ + setbe %al /* 1 if so, 0 if not */ + movzbl %al,%edi /* zero-extend that into %edi */ + call __audit_syscall_exit + movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi + jmp sysret_check +#endif /* CONFIG_AUDITSYSCALL */ + + /* Do syscall tracing */ +tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + jz auditsys +#endif + SAVE_REST + movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ + FIXUP_TOP_OF_STACK %rdi + movq %rsp,%rdi + call syscall_trace_enter + /* + * Reload arg registers from stack in case ptrace changed them. + * We don't reload %rax because syscall_trace_enter() returned + * the value it wants us to use in the table lookup. + */ + LOAD_ARGS ARGOFFSET, 1 + RESTORE_REST + cmpq $__NR_syscall_max,%rax + ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */ + movq %r10,%rcx /* fixup for C */ + call *sys_call_table(,%rax,8) + movq %rax,RAX-ARGOFFSET(%rsp) + /* Use IRET because user could have changed frame */ + +/* + * Syscall return path ending with IRET. + * Has correct top of stack, but partial stack frame. + */ +GLOBAL(int_ret_from_sys_call) + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + movl $_TIF_ALLWORK_MASK,%edi + /* edi: mask to check */ +GLOBAL(int_with_check) + LOCKDEP_SYS_EXIT_IRQ + GET_THREAD_INFO(%rcx) + movl TI_flags(%rcx),%edx + andl %edi,%edx + jnz int_careful + andl $~TS_COMPAT,TI_status(%rcx) + jmp retint_restore_args + + /* Either reschedule or signal or syscall exit tracking needed. */ + /* First do a reschedule test. */ + /* edx: work, edi: workmask */ +int_careful: + bt $TIF_NEED_RESCHED,%edx + jnc int_very_careful + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + pushq_cfi %rdi + call schedule + popq_cfi %rdi + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + jmp int_with_check + + /* handle signals and tracing -- both require a full stack frame */ +int_very_careful: + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) +int_check_syscall_exit_work: + SAVE_REST + /* Check for syscall exit trace */ + testl $_TIF_WORK_SYSCALL_EXIT,%edx + jz int_signal + pushq_cfi %rdi + leaq 8(%rsp),%rdi # &ptregs -> arg1 + call syscall_trace_leave + popq_cfi %rdi + andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi + jmp int_restore_rest + +int_signal: + testl $_TIF_DO_NOTIFY_MASK,%edx + jz 1f + movq %rsp,%rdi # &ptregs -> arg1 + xorl %esi,%esi # oldset -> arg2 + call do_notify_resume +1: movl $_TIF_WORK_MASK,%edi +int_restore_rest: + RESTORE_REST + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + jmp int_with_check + CFI_ENDPROC +END(system_call) + +/* + * Certain special system calls that need to save a complete full stack frame. 
+ */ + .macro PTREGSCALL label,func,arg +ENTRY(\label) + PARTIAL_FRAME 1 8 /* offset 8: return address */ + subq $REST_SKIP, %rsp + CFI_ADJUST_CFA_OFFSET REST_SKIP + call save_rest + DEFAULT_FRAME -2 8 /* offset 8: return address */ + leaq 8(%rsp), \arg /* pt_regs pointer */ + call \func + jmp ptregscall_common + CFI_ENDPROC +END(\label) + .endm + + PTREGSCALL stub_clone, sys_clone, %r8 + PTREGSCALL stub_fork, sys_fork, %rdi + PTREGSCALL stub_vfork, sys_vfork, %rdi + PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx + PTREGSCALL stub_iopl, sys_iopl, %rsi + +ENTRY(ptregscall_common) + DEFAULT_FRAME 1 8 /* offset 8: return address */ + RESTORE_TOP_OF_STACK %r11, 8 + movq_cfi_restore R15+8, r15 + movq_cfi_restore R14+8, r14 + movq_cfi_restore R13+8, r13 + movq_cfi_restore R12+8, r12 + movq_cfi_restore RBP+8, rbp + movq_cfi_restore RBX+8, rbx + ret $REST_SKIP /* pop extended registers */ + CFI_ENDPROC +END(ptregscall_common) + +ENTRY(stub_execve) + CFI_STARTPROC + addq $8, %rsp + PARTIAL_FRAME 0 + SAVE_REST + FIXUP_TOP_OF_STACK %r11 + movq %rsp, %rcx + call sys_execve + RESTORE_TOP_OF_STACK %r11 + movq %rax,RAX(%rsp) + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC +END(stub_execve) + +/* + * sigreturn is special because it needs to restore all registers on return. + * This cannot be done with SYSRET, so use the IRET return path instead. + */ +ENTRY(stub_rt_sigreturn) + CFI_STARTPROC + addq $8, %rsp + PARTIAL_FRAME 0 + SAVE_REST + movq %rsp,%rdi + FIXUP_TOP_OF_STACK %r11 + call sys_rt_sigreturn + movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC +END(stub_rt_sigreturn) + +/* + * Interrupt exit. + */ + +retint_with_reschedule: + PARTIAL_FRAME + movl $_TIF_WORK_MASK,%edi +retint_check: + LOCKDEP_SYS_EXIT_IRQ + movl TI_flags(%rcx),%edx + andl %edi,%edx + CFI_REMEMBER_STATE + jnz retint_careful +retint_restore_args: /* return to kernel space */ + movl EFLAGS-REST_SKIP(%rsp), %eax + shr $9, %eax # EAX[0] == IRET_EFLAGS.IF + GET_VCPU_INFO + andb evtchn_upcall_mask(%rsi),%al + andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask + jnz restore_all_enable_events # != 0 => enable event delivery + + RESTORE_ARGS 1,8,1 + HYPERVISOR_IRET 0 + + /* edi: workmask, edx: work */ +retint_careful: + CFI_RESTORE_STATE + bt $TIF_NEED_RESCHED,%edx + jnc retint_signal + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + pushq_cfi %rdi + call schedule + popq_cfi %rdi + GET_THREAD_INFO(%rcx) + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + jmp retint_check + +retint_signal: + testl $_TIF_DO_NOTIFY_MASK,%edx + jz retint_restore_args + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + SAVE_REST + movq $-1,ORIG_RAX(%rsp) + xorl %esi,%esi # oldset + movq %rsp,%rdi # &pt_regs + call do_notify_resume + RESTORE_REST + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + GET_THREAD_INFO(%rcx) + jmp retint_with_reschedule + +#ifdef CONFIG_PREEMPT + /* Returning to kernel space. Check if we need preemption */ + /* rcx: threadinfo. interrupts off. */ +ENTRY(retint_kernel) + cmpl $0,TI_preempt_count(%rcx) + jnz retint_restore_args + bt $TIF_NEED_RESCHED,TI_flags(%rcx) + jnc retint_restore_args + bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ + jnc retint_restore_args + call preempt_schedule_irq + jmp retint_kernel /* check again */ +#endif + + CFI_ENDPROC +END(retint_check) + +#ifndef CONFIG_XEN +/* + * APIC interrupts. 
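+ * These native vector stubs are only assembled when CONFIG_XEN is not set;
+ * in a Xen pv kernel the corresponding work arrives through the
+ * event-channel upcall (do_hypervisor_callback) instead.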
+ */ +.macro apicinterrupt num sym do_sym +ENTRY(\sym) + INTR_FRAME + pushq_cfi $~(\num) + interrupt \do_sym + jmp error_entry + CFI_ENDPROC +END(\sym) +.endm + +#ifdef CONFIG_SMP +apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \ + irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt +apicinterrupt REBOOT_VECTOR \ + reboot_interrupt smp_reboot_interrupt +#endif + +#ifdef CONFIG_X86_UV +apicinterrupt UV_BAU_MESSAGE \ + uv_bau_message_intr1 uv_bau_message_interrupt +#endif +apicinterrupt LOCAL_TIMER_VECTOR \ + apic_timer_interrupt smp_apic_timer_interrupt +apicinterrupt X86_PLATFORM_IPI_VECTOR \ + x86_platform_ipi smp_x86_platform_ipi + +#ifdef CONFIG_SMP + ALIGN + INTR_FRAME +.irp idx,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ + 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 +.if NUM_INVALIDATE_TLB_VECTORS > \idx +ENTRY(invalidate_interrupt\idx) + pushq_cfi $~(INVALIDATE_TLB_VECTOR_START+\idx) + jmp .Lcommon_invalidate_interrupt0 + CFI_ADJUST_CFA_OFFSET -8 +END(invalidate_interrupt\idx) +.endif +.endr + CFI_ENDPROC +apicinterrupt INVALIDATE_TLB_VECTOR_START, \ + invalidate_interrupt0, smp_invalidate_interrupt +#endif + +apicinterrupt THRESHOLD_APIC_VECTOR \ + threshold_interrupt smp_threshold_interrupt +apicinterrupt THERMAL_APIC_VECTOR \ + thermal_interrupt smp_thermal_interrupt + +#ifdef CONFIG_SMP +apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ + call_function_single_interrupt smp_call_function_single_interrupt +apicinterrupt CALL_FUNCTION_VECTOR \ + call_function_interrupt smp_call_function_interrupt +apicinterrupt RESCHEDULE_VECTOR \ + reschedule_interrupt smp_reschedule_interrupt +#endif + +apicinterrupt ERROR_APIC_VECTOR \ + error_interrupt smp_error_interrupt +apicinterrupt SPURIOUS_APIC_VECTOR \ + spurious_interrupt smp_spurious_interrupt + +#ifdef CONFIG_IRQ_WORK +apicinterrupt IRQ_WORK_VECTOR \ + irq_work_interrupt smp_irq_work_interrupt +#endif +#endif /* !CONFIG_XEN */ + +/* + * Exception entry points. + */ +.macro zeroentry sym do_sym +ENTRY(\sym) + INTR_FRAME + movq (%rsp),%rcx + CFI_RESTORE rcx + movq 8(%rsp),%r11 + CFI_RESTORE r11 + movq $-1,8(%rsp) /* ORIG_RAX: no syscall to restart */ + subq $ORIG_RAX-R15-1*8,%rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15-1*8 + call error_entry + DEFAULT_FRAME -1 + movq %rsp,%rdi /* pt_regs pointer */ + xorl %esi,%esi /* no error code */ + call \do_sym + jmp error_exit /* %ebx: no swapgs flag */ + CFI_ENDPROC +END(\sym) +.endm + +.macro paranoidzeroentry sym do_sym + zeroentry \sym \do_sym +.endm + +.macro paranoidzeroentry_ist sym do_sym ist + zeroentry \sym \do_sym +.endm + +.macro errorentry sym do_sym +ENTRY(\sym) + XCPT_FRAME + movq (%rsp),%rcx + CFI_RESTORE rcx + movq 8(%rsp),%r11 + CFI_RESTORE r11 + subq $ORIG_RAX-R15-2*8,%rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15-2*8 + call error_entry + DEFAULT_FRAME -1 + movq %rsp,%rdi /* pt_regs pointer */ + movq ORIG_RAX(%rsp),%rsi /* get error code */ + movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ + call \do_sym + jmp error_exit /* %ebx: no swapgs flag */ + CFI_ENDPROC +END(\sym) +.endm + + /* error code is on the stack already */ +.macro paranoiderrorentry sym do_sym + errorentry \sym \do_sym +.endm + +/* + * Copied from arch/xen/i386/kernel/entry.S + */ +# A note on the "critical region" in our callback handler. +# We want to avoid stacking callback handlers due to events occurring +# during handling of the last event. To do this, we keep events disabled +# until we've done all processing. 
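+# ("Events" here are Xen event-channel upcalls, the paravirtual counterpart
+# of hardware interrupts; they are masked per-vcpu through
+# evtchn_upcall_mask rather than with cli/sti.)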
+# HOWEVER, we must enable events before popping the stack frame
+# (which can't be done atomically), so it would still be possible to get
+# enough handler activations to overflow the stack.
+# Although unlikely, bugs of that kind are hard to track down, so we'd
+# like to avoid the possibility.
+# So, on entry to the handler we detect whether we interrupted an
+# existing activation in its critical region -- if so, we pop the current
+# activation and restart the handler using the previous one.
+ENTRY(do_hypervisor_callback)	# do_hypervisor_callback(struct pt_regs *)
+	CFI_STARTPROC
+# Since we don't modify %rdi, evtchn_do_upcall(struct pt_regs *) will
+# see the correct pointer to the pt_regs
+	movq %rdi, %rsp		# we don't return, adjust the stack frame
+	CFI_ENDPROC
+	DEFAULT_FRAME
+11:	incl PER_CPU_VAR(irq_count)
+	movq %rsp,%rbp
+	CFI_DEF_CFA_REGISTER rbp
+	cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
+	pushq %rbp			# backlink for old unwinder
+	call evtchn_do_upcall
+	popq %rsp
+	CFI_DEF_CFA_REGISTER rsp
+	decl PER_CPU_VAR(irq_count)
+	jmp error_exit
+	CFI_ENDPROC
+END(do_hypervisor_callback)
+
+	ALIGN
+restore_all_enable_events:
+	PARTIAL_FRAME
+	TRACE_IRQS_ON
+	__ENABLE_INTERRUPTS
+
+scrit:	/**** START OF CRITICAL REGION ****/
+	__TEST_PENDING
+	CFI_REMEMBER_STATE
+	jnz 14f			# process more events if necessary...
+	RESTORE_ARGS 1,8,1
+	HYPERVISOR_IRET 0
+
+	CFI_RESTORE_STATE
+14:	__DISABLE_INTERRUPTS
+	SAVE_REST
+	movq %rsp,%rdi		# set the argument again
+	jmp 11b
+	CFI_ENDPROC
+ecrit:	/**** END OF CRITICAL REGION ****/
+# At this point, unlike on x86-32, we do not attempt the critical-region
+# fixup: it would complicate the code, and the stack frame is more complex
+# on x86-64. If the kernel is interrupted inside the critical section it
+# simply performs the IRET; everything is restored at that point, i.e.
+# execution resumes at the interrupted instruction with the same context.
+
+# The hypervisor uses this callback for application faults while it executes.
+# We get here for two reasons:
+#  1. Fault while reloading DS, ES, FS or GS
+#  2. Fault while executing IRET
+# Category 1 needs no fixup, as Xen has already reloaded all segment
+# registers that could be reloaded and zeroed the others.
+# Category 2 we fix up by killing the current process. We cannot use the
+# normal Linux return path in this case, because if we used the IRET
+# hypercall to pop the stack frame we would end up in an infinite loop of
+# failsafe callbacks.
+# We distinguish between the categories by comparing each saved segment
+# register with its current contents: any discrepancy means we are in
+# category 1.
+ENTRY(failsafe_callback)
+	INTR_FRAME offset=4*8
+	movw %ds,%cx
+	cmpw %cx,0x10(%rsp)
+	CFI_REMEMBER_STATE
+	jne 1f
+	movw %es,%cx
+	cmpw %cx,0x18(%rsp)
+	jne 1f
+	movw %fs,%cx
+	cmpw %cx,0x20(%rsp)
+	jne 1f
+	movw %gs,%cx
+	cmpw %cx,0x28(%rsp)
+	jne 1f
+	/* All segments match their saved values => Category 2 (Bad IRET). */
+	movq (%rsp),%rcx
+	CFI_RESTORE rcx
+	movq 8(%rsp),%r11
+	CFI_RESTORE r11
+	addq $0x30,%rsp
+	CFI_ADJUST_CFA_OFFSET -0x30
+	movq $11,%rdi	/* SIGSEGV */
+	jmp do_exit
+	CFI_RESTORE_STATE
+1:	/* Segment mismatch => Category 1 (Bad segment).  Retry the IRET.
*/ + movq (%rsp),%rcx + CFI_RESTORE rcx + movq 8(%rsp),%r11 + CFI_RESTORE r11 + addq $0x30,%rsp + CFI_ADJUST_CFA_OFFSET -0x30 + pushq_cfi $0 + SAVE_ALL + jmp error_exit + CFI_ENDPROC + +zeroentry divide_error do_divide_error +zeroentry overflow do_overflow +zeroentry bounds do_bounds +zeroentry invalid_op do_invalid_op +zeroentry device_not_available do_device_not_available +zeroentry hypervisor_callback do_hypervisor_callback +zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun +errorentry invalid_TSS do_invalid_TSS +errorentry segment_not_present do_segment_not_present +zeroentry coprocessor_error do_coprocessor_error +errorentry alignment_check do_alignment_check +zeroentry simd_coprocessor_error do_simd_coprocessor_error + +ENTRY(kernel_thread_helper) + pushq $0 # fake return address + CFI_STARTPROC + /* + * Here we are in the child and the registers are set as they were + * at kernel_thread() invocation in the parent. + */ + call *%rsi + # exit + mov %eax, %edi + call do_exit + ud2 # padding for call trace + CFI_ENDPROC +END(kernel_thread_helper) + +/* + * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. + * + * C extern interface: + * extern long execve(const char *name, char **argv, char **envp) + * + * asm input arguments: + * rdi: name, rsi: argv, rdx: envp + * + * We want to fallback into: + * extern long sys_execve(const char *name, char **argv,char **envp, struct pt_regs *regs) + * + * do_sys_execve asm fallback arguments: + * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack + */ +ENTRY(kernel_execve) + CFI_STARTPROC + FAKE_STACK_FRAME $0 + SAVE_ALL + movq %rsp,%rcx + call sys_execve + movq %rax, RAX(%rsp) + RESTORE_REST + testq %rax,%rax + jne 1f + jmp int_ret_from_sys_call +1: RESTORE_ARGS + UNFAKE_STACK_FRAME + ret + CFI_ENDPROC +END(kernel_execve) + +/* Call softirq on interrupt stack. Interrupts are off. 
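+ * We bump the per-cpu irq_count and, unless we are already on the IRQ
+ * stack, switch %rsp to irq_stack_ptr so that deep softirq work cannot
+ * overflow the small task stack; leaveq restores the original stack.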
*/ +ENTRY(call_softirq) + CFI_STARTPROC + pushq_cfi %rbp + CFI_REL_OFFSET rbp,0 + mov %rsp,%rbp + CFI_DEF_CFA_REGISTER rbp + incl PER_CPU_VAR(irq_count) + cmove PER_CPU_VAR(irq_stack_ptr),%rsp + push %rbp # backlink for old unwinder + call __do_softirq + leaveq + CFI_RESTORE rbp + CFI_DEF_CFA_REGISTER rsp + CFI_ADJUST_CFA_OFFSET -8 + decl PER_CPU_VAR(irq_count) + ret + CFI_ENDPROC +END(call_softirq) + +#ifdef CONFIG_STACK_UNWIND +ENTRY(arch_unwind_init_running) + CFI_STARTPROC + movq %r15, R15(%rdi) + movq %r14, R14(%rdi) + xchgq %rsi, %rdx + movq %r13, R13(%rdi) + movq %r12, R12(%rdi) + xorl %eax, %eax + movq %rbp, RBP(%rdi) + movq %rbx, RBX(%rdi) + movq (%rsp), %r9 + xchgq %rdx, %rcx + movq %rax, R11(%rdi) + movq %rax, R10(%rdi) + movq %rax, R9(%rdi) + movq %rax, R8(%rdi) + movq %rax, RAX(%rdi) + movq %rax, RCX(%rdi) + movq %rax, RDX(%rdi) + movq %rax, RSI(%rdi) + movq %rax, RDI(%rdi) + movq %rax, ORIG_RAX(%rdi) + movq %r9, RIP(%rdi) + leaq 8(%rsp), %r9 + movq $__KERNEL_CS, CS(%rdi) + movq %rax, EFLAGS(%rdi) + movq %r9, RSP(%rdi) + movq $__KERNEL_DS, SS(%rdi) + jmpq *%rcx + CFI_ENDPROC +END(arch_unwind_init_running) +#endif + +/* + * Some functions should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" + +paranoidzeroentry_ist debug do_debug DEBUG_STACK +zeroentry nmi do_nmi_callback +paranoidzeroentry_ist int3 do_int3 DEBUG_STACK +paranoiderrorentry stack_segment do_stack_segment +errorentry general_protection do_general_protection +errorentry page_fault do_page_fault +#ifdef CONFIG_KVM_GUEST +errorentry async_page_fault do_async_page_fault +#endif +#ifdef CONFIG_X86_MCE +paranoidzeroentry machine_check *machine_check_vector(%rip) +#endif + +#ifndef CONFIG_XEN + /* + * "Paranoid" exit path from exception stack. + * Paranoid because this is used by NMIs and cannot take + * any kernel state for granted. + * We don't do kernel preemption checks here, because only + * NMI should be common and it does not enable IRQs and + * cannot get reschedule ticks. + * + * "trace" is 0 for the NMI handler only, because irq-tracing + * is fundamentally NMI-unsafe. (we cannot change the soft and + * hard flags at once, atomically) + */ + + /* ebx: no swapgs flag */ +ENTRY(paranoid_exit) + DEFAULT_FRAME + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl %ebx,%ebx /* swapgs needed? */ + jnz paranoid_restore + testl $3,CS(%rsp) + jnz paranoid_userspace +paranoid_swapgs: + TRACE_IRQS_IRETQ 0 + SWAPGS_UNSAFE_STACK + RESTORE_ALL 8 + jmp irq_return +paranoid_restore: + TRACE_IRQS_IRETQ 0 + RESTORE_ALL 8 + jmp irq_return +paranoid_userspace: + GET_THREAD_INFO(%rcx) + movl TI_flags(%rcx),%ebx + andl $_TIF_WORK_MASK,%ebx + jz paranoid_swapgs + movq %rsp,%rdi /* &pt_regs */ + call sync_regs + movq %rax,%rsp /* switch stack for scheduling */ + testl $_TIF_NEED_RESCHED,%ebx + jnz paranoid_schedule + movl %ebx,%edx /* arg3: thread flags */ + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + xorl %esi,%esi /* arg2: oldset */ + movq %rsp,%rdi /* arg1: &pt_regs */ + call do_notify_resume + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + jmp paranoid_userspace +paranoid_schedule: + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) + call schedule + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + jmp paranoid_userspace + CFI_ENDPROC +END(paranoid_exit) +#endif + +/* + * Exception entry point. This expects an error code/orig_rax on the stack. + * returns in "no swapgs flag" in %ebx. 
+ */ +ENTRY(error_entry) + XCPT_FRAME start=2 offset=ORIG_RAX-R15+8 + /* oldrax contains error code */ + cld + movq %rdi, RDI+8(%rsp) + movq %rsi, RSI+8(%rsp) + movq %rdx, RDX+8(%rsp) + movq %rcx, RCX+8(%rsp) + movq %rax, RAX+8(%rsp) + movq %r8, R8+8(%rsp) + movq %r9, R9+8(%rsp) + movq %r10, R10+8(%rsp) + movq %r11, R11+8(%rsp) + movq_cfi rbx, RBX+8 + movq %rbp, RBP+8(%rsp) + movq %r12, R12+8(%rsp) + movq %r13, R13+8(%rsp) + movq %r14, R14+8(%rsp) + movq %r15, R15+8(%rsp) +#ifndef CONFIG_XEN + xorl %ebx,%ebx + testl $3,CS+8(%rsp) + je error_kernelspace +error_swapgs: + SWAPGS +error_sti: +#endif + TRACE_IRQS_OFF + ret + +#ifndef CONFIG_XEN +/* + * There are two places in the kernel that can potentially fault with + * usergs. Handle them here. The exception handlers after iret run with + * kernel gs again, so don't set the user space flag. B stepping K8s + * sometimes report an truncated RIP for IRET exceptions returning to + * compat mode. Check for these here too. + */ +error_kernelspace: + CFI_REL_OFFSET rcx, RCX+8 + incl %ebx + leaq irq_return(%rip),%rcx + cmpq %rcx,RIP+8(%rsp) + je error_swapgs + movl %ecx,%eax /* zero extend */ + cmpq %rax,RIP+8(%rsp) + je bstep_iret + cmpq $gs_change,RIP+8(%rsp) + je error_swapgs + jmp error_sti + +bstep_iret: + /* Fix truncated RIP */ + movq %rcx,RIP+8(%rsp) + jmp error_swapgs +#endif + CFI_ENDPROC +END(error_entry) + + +ENTRY(error_exit) + DEFAULT_FRAME + RESTORE_REST + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + GET_THREAD_INFO(%rcx) + testb $3,CS-ARGOFFSET(%rsp) + jz retint_kernel + LOCKDEP_SYS_EXIT_IRQ + movl TI_flags(%rcx),%edx + movl $_TIF_WORK_MASK,%edi + andl %edi,%edx + jnz retint_careful + jmp retint_restore_args + CFI_ENDPROC +END(error_exit) + + +#define extern # +#include + +.pushsection PER_CPU_BASE_SECTION, "aw", @progbits +in_NMI: .byte 0 +.popsection + +do_nmi_callback: + CFI_STARTPROC + addq $8, %rsp + CFI_ENDPROC + DEFAULT_FRAME + orb $1, PER_CPU_VAR(in_NMI) + js 1f +0: + movb $0x80, PER_CPU_VAR(in_NMI) + call do_nmi + movl $0x80, %eax + cmpxchgb %ah, PER_CPU_VAR(in_NMI) + jne 0b + orl $NMI_MASK,EFLAGS(%rsp) +1: + RESTORE_REST + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + GET_THREAD_INFO(%rcx) + jmp retint_restore_args + CFI_ENDPROC +END(do_nmi_callback) + + +#ifndef CONFIG_IA32_EMULATION +ENTRY(ignore_sysret) + INTR_FRAME + popq_cfi %rcx + CFI_RESTORE rcx + popq_cfi %r11 + CFI_RESTORE r11 + mov $-ENOSYS,%eax + # any non-zero value not having VGCF_in_syscall set will do: + HYPERVISOR_IRET VGCF_i387_valid + CFI_ENDPROC +END(ignore_sysret) +#endif + +/* + * End of kprobes section + */ + .popsection diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 499ceb5..78d0610 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1256,7 +1256,7 @@ ENTRY(arch_unwind_init_running) END(arch_unwind_init_running) #endif -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN zeroentry xen_hypervisor_callback xen_do_hypervisor_callback /* @@ -1356,7 +1356,7 @@ END(xen_failsafe_callback) apicinterrupt XEN_HVM_EVTCHN_CALLBACK \ xen_hvm_callback_vector xen_evtchn_do_upcall -#endif /* CONFIG_XEN */ +#endif /* CONFIG_PARAVIRT_XEN */ /* * Some functions should be protected against kprobes @@ -1366,7 +1366,7 @@ apicinterrupt XEN_HVM_EVTCHN_CALLBACK \ paranoidzeroentry_ist debug do_debug DEBUG_STACK paranoidzeroentry_ist int3 do_int3 DEBUG_STACK paranoiderrorentry stack_segment do_stack_segment -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN zeroentry xen_debug do_debug zeroentry xen_int3 do_int3 errorentry 
xen_stack_segment do_stack_segment diff --git a/arch/x86/kernel/fixup.c b/arch/x86/kernel/fixup.c new file mode 100644 index 0000000..64cd323 --- /dev/null +++ b/arch/x86/kernel/fixup.c @@ -0,0 +1,89 @@ +/****************************************************************************** + * fixup.c + * + * Binary-rewriting of certain IA32 instructions, on notification by Xen. + * Used to avoid repeated slow emulation of common instructions used by the + * user-space TLS (Thread-Local Storage) libraries. + * + * **** NOTE **** + * Issues with the binary rewriting have caused it to be removed. Instead + * we rely on Xen's emulator to boot the kernel, and then print a banner + * message recommending that the user disables /lib/tls. + * + * Copyright (c) 2004, K A Fraser + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include + +#define DP(_f, _args...) pr_alert(" " _f "\n" , ## _args ) + +dotraplinkage void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) +{ + static unsigned long printed = 0; + char info[100]; + int i; + + /* Ignore statically-linked init. */ + if (current->tgid == 1) + return; + + VOID(HYPERVISOR_vm_assist(VMASST_CMD_disable, + VMASST_TYPE_4gb_segments_notify)); + + if (test_and_set_bit(0, &printed)) + return; + + sprintf(info, "%s (pid=%d)", current->comm, current->tgid); + + DP(""); + DP("***************************************************************"); + DP("***************************************************************"); + DP("** WARNING: Currently emulating unsupported memory accesses **"); + DP("** in /lib/tls glibc libraries. The emulation is **"); + DP("** slow. To ensure full performance you should **"); + DP("** install a 'xen-friendly' (nosegneg) version of **"); + DP("** the library, or disable tls support by executing **"); + DP("** the following as root: **"); + DP("** mv /lib/tls /lib/tls.disabled **"); + DP("** Offending process: %-38.38s **", info); + DP("***************************************************************"); + DP("***************************************************************"); + DP(""); + + for (i = 5; i > 0; i--) { + touch_softlockup_watchdog(); + printk("Pausing... 
%d", i); + mdelay(1000); + printk("\b\b\b\b\b\b\b\b\b\b\b\b"); + } + + printk("Continuing...\n\n"); +} + +static int __init fixup_init(void) +{ + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable, + VMASST_TYPE_4gb_segments_notify)); + return 0; +} +__initcall(fixup_init); diff --git a/arch/x86/kernel/head-xen.c b/arch/x86/kernel/head-xen.c new file mode 100644 index 0000000..b15fd60 --- /dev/null +++ b/arch/x86/kernel/head-xen.c @@ -0,0 +1,223 @@ +#include +#include +#include +#include + +#include +#ifndef CONFIG_XEN +#include + +#define BIOS_LOWMEM_KILOBYTES 0x413 + +/* + * The BIOS places the EBDA/XBDA at the top of conventional + * memory, and usually decreases the reported amount of + * conventional memory (int 0x12) too. This also contains a + * workaround for Dell systems that neglect to reserve EBDA. + * The same workaround also avoids a problem with the AMD768MPX + * chipset: reserve a page before VGA to prevent PCI prefetch + * into it (errata #56). Usually the page is reserved anyways, + * unless you have no PS/2 mouse plugged in. + */ +void __init reserve_ebda_region(void) +{ + unsigned int lowmem, ebda_addr; + + /* To determine the position of the EBDA and the */ + /* end of conventional memory, we need to look at */ + /* the BIOS data area. In a paravirtual environment */ + /* that area is absent. We'll just have to assume */ + /* that the paravirt case can handle memory setup */ + /* correctly, without our help. */ + if (paravirt_enabled()) + return; + + /* end of low (conventional) memory */ + lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES); + lowmem <<= 10; + + /* start of EBDA area */ + ebda_addr = get_bios_ebda(); + + /* Fixup: bios puts an EBDA in the top 64K segment */ + /* of conventional memory, but does not adjust lowmem. */ + if ((lowmem - ebda_addr) <= 0x10000) + lowmem = ebda_addr; + + /* Fixup: bios does not report an EBDA at all. */ + /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */ + if ((ebda_addr == 0) && (lowmem >= 0x9f000)) + lowmem = 0x9f000; + + /* Paranoia: should never happen, but... 
*/ + if ((lowmem == 0) || (lowmem >= 0x100000)) + lowmem = 0x9f000; + + /* reserve all memory between lowmem and the 1MB mark */ + memblock_reserve(lowmem, 0x100000 - lowmem); +} +#else /* CONFIG_XEN */ +#include +#include +#include +#include +#include +#include +#include + +extern void hypervisor_callback(void); +extern void failsafe_callback(void); +extern void nmi(void); + +#ifdef CONFIG_X86_64 +#include +#define CALLBACK_ADDR(fn) ((unsigned long)(fn)) +#else +#define CALLBACK_ADDR(fn) { __KERNEL_CS, (unsigned long)(fn) } +#endif + +unsigned long __initdata xen_initrd_start; + +unsigned long *__read_mostly machine_to_phys_mapping = + (void *)MACH2PHYS_VIRT_START; +EXPORT_SYMBOL(machine_to_phys_mapping); +unsigned long __read_mostly machine_to_phys_nr; +EXPORT_SYMBOL(machine_to_phys_nr); + +void __init xen_start_kernel(void) +{ + unsigned int i; + struct xen_machphys_mapping mapping; + + xen_setup_features(); + + if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { + machine_to_phys_mapping = (unsigned long *)mapping.v_start; + machine_to_phys_nr = mapping.max_mfn + 1; + } else + machine_to_phys_nr = MACH2PHYS_NR_ENTRIES; +#ifdef CONFIG_X86_32 + WARN_ON(machine_to_phys_mapping + (machine_to_phys_nr - 1) + < machine_to_phys_mapping); +#endif + + if (!xen_feature(XENFEAT_auto_translated_physmap)) + phys_to_machine_mapping = + (unsigned long *)xen_start_info->mfn_list; + + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable, + VMASST_TYPE_writable_pagetables)); + + memblock_reserve(ALIGN(__pa_symbol(&_end), PAGE_SIZE), + __pa(xen_start_info->pt_base) + + PFN_PHYS(xen_start_info->nr_pt_frames)); + +#ifdef CONFIG_X86_32 +{ + extern pte_t swapper_pg_fixmap[PTRS_PER_PTE]; + unsigned long addr; + + /* Do an early initialization of the fixmap area */ + make_lowmem_page_readonly(swapper_pg_fixmap, XENFEAT_writable_page_tables); + addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE); + set_pmd(pmd_offset(pud_offset(swapper_pg_dir + pgd_index(addr), + addr), + addr), + __pmd(__pa_symbol(swapper_pg_fixmap) | _PAGE_TABLE)); +} +#else + x86_configure_nx(); + xen_init_pt(); +#endif + +#define __FIXADDR_TOP (-PAGE_SIZE) +#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)) +#define FIX_BUG_ON(fix) BUILD_BUG_ON(pmd_index(__fix_to_virt(FIX_##fix)) \ + != pmd_index(__fix_to_virt(FIX_EARLYCON_MEM_BASE))) + FIX_BUG_ON(SHARED_INFO); + FIX_BUG_ON(ISAMAP_BEGIN); + FIX_BUG_ON(ISAMAP_END); +#undef pmd_index +#undef __FIXADDR_TOP + + /* Switch to the real shared_info page, and clear the dummy page. */ + set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info); + HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO); + clear_page(empty_zero_page); + + setup_vcpu_info(0); + + /* Set up mapping of lowest 1MB of physical memory. 
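+	 * In the initial domain the ISA range is mapped 1:1 so that legacy
+	 * BIOS and VGA areas remain reachable; unprivileged domains instead
+	 * map every page to the shared empty_zero_page, read-only.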
*/ + for (i = 0; i < NR_FIX_ISAMAPS; i++) + if (is_initial_xendomain()) + set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE); + else + __set_fixmap(FIX_ISAMAP_BEGIN - i, + virt_to_machine(empty_zero_page), + PAGE_KERNEL_RO); + + if (is_initial_xendomain()) { + x86_platform.get_wallclock = mach_get_cmos_time; + x86_platform.set_wallclock = mach_set_rtc_mmss; + + pci_request_acs(); + } else + x86_init.resources.probe_roms = x86_init_noop; +} + +void __init xen_arch_setup(void) +{ + int ret; + static const struct callback_register __initconst event = { + .type = CALLBACKTYPE_event, + .address = CALLBACK_ADDR(hypervisor_callback) + }; + static const struct callback_register __initconst failsafe = { + .type = CALLBACKTYPE_failsafe, + .address = CALLBACK_ADDR(failsafe_callback) + }; +#ifdef CONFIG_X86_64 + static const struct callback_register __initconst syscall = { + .type = CALLBACKTYPE_syscall, + .address = CALLBACK_ADDR(system_call) + }; +#endif + static const struct callback_register __initconst nmi_cb = { + .type = CALLBACKTYPE_nmi, + .address = CALLBACK_ADDR(nmi) + }; + + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event); + if (ret == 0) + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe); +#ifdef CONFIG_X86_64 + if (ret == 0) + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall); +#endif +#if CONFIG_XEN_COMPAT <= 0x030002 +#ifdef CONFIG_X86_32 + if (ret == -ENOSYS) + ret = HYPERVISOR_set_callbacks( + event.address.cs, event.address.eip, + failsafe.address.cs, failsafe.address.eip); +#else + ret = HYPERVISOR_set_callbacks( + event.address, + failsafe.address, + syscall.address); +#endif +#endif + BUG_ON(ret); + + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (ret == -ENOSYS) { + static struct xennmi_callback __initdata cb = { + .handler_address = (unsigned long)nmi + }; + + HYPERVISOR_nmi_op(XENNMI_register_callback, &cb); + } +#endif +} +#endif /* CONFIG_XEN */ diff --git a/arch/x86/kernel/head32-xen.c b/arch/x86/kernel/head32-xen.c new file mode 100644 index 0000000..fcc893b --- /dev/null +++ b/arch/x86/kernel/head32-xen.c @@ -0,0 +1,103 @@ +/* + * linux/arch/i386/kernel/head32.c -- prepare to run common code + * + * Copyright (C) 2000 Andrea Arcangeli SuSE + * Copyright (C) 2007 Eric Biederman + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +static void __init i386_default_early_setup(void) +{ + /* Initialize 32bit specific setup functions */ + x86_init.resources.reserve_resources = i386_reserve_resources; +#ifndef CONFIG_XEN + x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; + + reserve_ebda_region(); +#endif +} + +void __init i386_start_kernel(void) +{ +#ifdef CONFIG_XEN + struct xen_platform_parameters pp; + + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable, + VMASST_TYPE_4gb_segments)); + + init_mm.pgd = swapper_pg_dir = (pgd_t *)xen_start_info->pt_base; + + if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) { + hypervisor_virt_start = pp.virt_start; + reserve_top_address(0UL - pp.virt_start); + } + + BUG_ON(pte_index(hypervisor_virt_start)); +#endif + + memblock_reserve(__pa_symbol(&_text), + __pa_symbol(&__bss_stop) - __pa_symbol(&_text)); + +#ifndef CONFIG_XEN +#ifdef CONFIG_BLK_DEV_INITRD + /* Reserve INITRD */ + if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { + /* Assume only end is not page aligned */ + u64 ramdisk_image = boot_params.hdr.ramdisk_image; + u64 ramdisk_size = 
boot_params.hdr.ramdisk_size; + u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); + memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image); + } +#endif + + /* Call the subarch specific early setup function */ + switch (boot_params.hdr.hardware_subarch) { + case X86_SUBARCH_MRST: + x86_mrst_early_setup(); + break; + case X86_SUBARCH_CE4100: + x86_ce4100_early_setup(); + break; + default: + i386_default_early_setup(); + break; + } +#else +#ifdef CONFIG_BLK_DEV_INITRD + BUG_ON(xen_start_info->flags & SIF_MOD_START_PFN); + if (xen_start_info->mod_start) + xen_initrd_start = __pa(xen_start_info->mod_start); +#endif + { + int max_cmdline; + + if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE) + max_cmdline = COMMAND_LINE_SIZE; + memcpy(boot_command_line, xen_start_info->cmd_line, max_cmdline); + boot_command_line[max_cmdline-1] = '\0'; + } + + i386_default_early_setup(); + xen_start_kernel(); +#endif + + /* + * At this point everything still needed from the boot loader + * or BIOS or kernel text should be early reserved or marked not + * RAM in e820. All other memory is free game. + */ + + start_kernel(); +} diff --git a/arch/x86/kernel/head64-xen.c b/arch/x86/kernel/head64-xen.c new file mode 100644 index 0000000..b2010d8 --- /dev/null +++ b/arch/x86/kernel/head64-xen.c @@ -0,0 +1,146 @@ +/* + * prepare to run common code + * + * Copyright (C) 2000 Andrea Arcangeli SuSE + * + * Jun Nakajima + * Modified for Xen. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef CONFIG_XEN +static void __init zap_identity_mappings(void) +{ + pgd_t *pgd = pgd_offset_k(0UL); + pgd_clear(pgd); + __flush_tlb_all(); +} + +/* Don't add a printk in there. printk relies on the PDA which is not initialized + yet. */ +static void __init clear_bss(void) +{ + memset(__bss_start, 0, + (unsigned long) __bss_stop - (unsigned long) __bss_start); +} +#endif + +static void __init copy_bootdata(char *real_mode_data) +{ +#ifndef CONFIG_XEN + char * command_line; + + memcpy(&boot_params, real_mode_data, sizeof boot_params); + if (boot_params.hdr.cmd_line_ptr) { + command_line = __va(boot_params.hdr.cmd_line_ptr); + memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); + } +#else + int max_cmdline; + + if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE) + max_cmdline = COMMAND_LINE_SIZE; + memcpy(boot_command_line, xen_start_info->cmd_line, max_cmdline); + boot_command_line[max_cmdline-1] = '\0'; +#endif +} + +#include + +void __init x86_64_start_kernel(char * real_mode_data) +{ + /* + * Build-time sanity checks on the kernel image and module + * area mappings. 
(these are purely build-time and produce no code) + */ + BUILD_BUG_ON(MODULES_VADDR < KERNEL_IMAGE_START); + BUILD_BUG_ON(MODULES_VADDR-KERNEL_IMAGE_START < KERNEL_IMAGE_SIZE); + BUILD_BUG_ON(MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE); + BUILD_BUG_ON((KERNEL_IMAGE_START & ~PMD_MASK) != 0); + BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0); + BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); + BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == + (__START_KERNEL & PGDIR_MASK))); + BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); + + xen_start_info = (struct start_info *)real_mode_data; + xen_start_kernel(); + +#ifndef CONFIG_XEN + /* clear bss before set_intr_gate with early_idt_handler */ + clear_bss(); + + /* Make NULL pointers segfault */ + zap_identity_mappings(); + + for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) { +#ifdef CONFIG_EARLY_PRINTK + set_intr_gate(i, &early_idt_handlers[i]); +#else + set_intr_gate(i, early_idt_handler); +#endif + } + load_idt((const struct desc_ptr *)&idt_descr); +#endif + + if (console_loglevel == 10) + early_printk("Kernel alive\n"); + + xen_switch_pt(); + + x86_64_start_reservations(real_mode_data); +} + +void __init x86_64_start_reservations(char *real_mode_data) +{ + copy_bootdata(__va(real_mode_data)); + + memblock_reserve(__pa_symbol(&_text), + __pa_symbol(&__bss_stop) - __pa_symbol(&_text)); + +#ifdef CONFIG_BLK_DEV_INITRD + /* Reserve INITRD if needed. */ + if (xen_start_info->flags & SIF_MOD_START_PFN) { + reserve_pfn_range(xen_start_info->mod_start, + PFN_UP(xen_start_info->mod_len)); + xen_initrd_start = xen_start_info->mod_start << PAGE_SHIFT; + } else if (xen_start_info->mod_start) + xen_initrd_start = __pa(xen_start_info->mod_start); +#endif + + if (xen_feature(XENFEAT_auto_translated_physmap)) + xen_start_info->mfn_list = ~0UL; + else if (xen_start_info->mfn_list < __START_KERNEL_map) + reserve_pfn_range(xen_start_info->first_p2m_pfn, + xen_start_info->nr_p2m_frames); + + /* + * At this point everything still needed from the boot loader + * or BIOS or kernel text should be early reserved or marked not + * RAM in e820. All other memory is free game. + */ + + start_kernel(); +} diff --git a/arch/x86/kernel/head_32-xen.S b/arch/x86/kernel/head_32-xen.S new file mode 100644 index 0000000..c434cef --- /dev/null +++ b/arch/x86/kernel/head_32-xen.S @@ -0,0 +1,220 @@ + + +.text +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * References to members of the new_cpu_data structure. 
+ */ + +#define X86 new_cpu_data+CPUINFO_x86 +#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor +#define X86_MODEL new_cpu_data+CPUINFO_x86_model +#define X86_MASK new_cpu_data+CPUINFO_x86_mask +#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math +#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level +#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability +#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id + +__HEAD +#define VIRT_ENTRY_OFFSET 0x0 +.org VIRT_ENTRY_OFFSET +ENTRY(startup_32) + movl %esi,xen_start_info + cld + + /* Set up the stack pointer */ + movl $(init_thread_union+THREAD_SIZE),%esp + + /* get vendor info */ + xorl %eax,%eax # call CPUID with 0 -> return vendor ID + XEN_CPUID + movl %eax,X86_CPUID # save CPUID level + movl %ebx,X86_VENDOR_ID # lo 4 chars + movl %edx,X86_VENDOR_ID+4 # next 4 chars + movl %ecx,X86_VENDOR_ID+8 # last 4 chars + + movl $1,%eax # Use the CPUID instruction to get CPU type + XEN_CPUID + movb %al,%cl # save reg for future use + andb $0x0f,%ah # mask processor family + movb %ah,X86 + andb $0xf0,%al # mask model + shrb $4,%al + movb %al,X86_MODEL + andb $0x0f,%cl # mask mask revision + movb %cl,X86_MASK + movl %edx,X86_CAPABILITY + +#ifdef CONFIG_CC_STACKPROTECTOR + /* + * The linker can't handle this by relocation. Manually set + * base address in stack canary segment descriptor. + */ + movl $gdt_page,%eax + movl $stack_canary,%ecx + movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) + shrl $16, %ecx + movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) + movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax) +#endif + + # %esi still points to start_info, and no registers + # need to be preserved. + + movl XEN_START_mfn_list(%esi), %ebx + movl $(gdt_page - __PAGE_OFFSET), %eax + shrl $PAGE_SHIFT, %eax + movl (%ebx,%eax,4), %ecx + pushl %ecx # frame number for set_gdt below + + xorl %esi, %esi + xorl %edx, %edx + shldl $PAGE_SHIFT, %ecx, %edx + shll $PAGE_SHIFT, %ecx + orl $_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_DIRTY, %ecx + movl $gdt_page, %ebx + movl $__HYPERVISOR_update_va_mapping, %eax + int $0x82 + + movl $(PAGE_SIZE / 8), %ecx + movl %esp, %ebx + movl $__HYPERVISOR_set_gdt, %eax + int $0x82 + + popl %ecx + + movl $(__KERNEL_PERCPU), %eax + movl %eax,%fs # set this cpu's percpu + + movl $(__KERNEL_STACK_CANARY),%eax + movl %eax,%gs + + cld # gcc2 wants the direction flag cleared at all times + + pushl $0 # fake return address for unwinder + jmp i386_start_kernel + +#define HYPERCALL_PAGE_OFFSET 0x1000 +.org HYPERCALL_PAGE_OFFSET +ENTRY(hypercall_page) + CFI_STARTPROC +.skip 0x1000 + CFI_ENDPROC + +/* + * BSS section + */ +__PAGE_ALIGNED_BSS + .align PAGE_SIZE +ENTRY(swapper_pg_fixmap) + .fill 1024,4,0 +ENTRY(empty_zero_page) + .fill 4096,1,0 + +/* + * This starts the data section. 
+ */ +.data + +#ifdef CONFIG_XEN_UNPRIVILEGED_GUEST +# define XEN_DOM0_CAP 0 +# define XEN_DOM0_CAP_STR "" +#else +# define XEN_DOM0_CAP (1 << XENFEAT_dom0) +# if CONFIG_XEN_COMPAT < 0x040200 +# define XEN_DOM0_CAP_STR "" +# else +# define XEN_DOM0_CAP_STR "|dom0" +# endif +#endif + +#if CONFIG_XEN_COMPAT <= 0x030002 +/* + * __xen_guest information + */ +.macro utoa value + .if (\value) < 0 || (\value) >= 0x10 + utoa (((\value)>>4)&0x0fffffff) + .endif + .if ((\value) & 0xf) < 10 + .byte '0' + ((\value) & 0xf) + .else + .byte 'A' + ((\value) & 0xf) - 10 + .endif +.endm + +.section __xen_guest + .ascii "GUEST_OS=linux,GUEST_VER=2.6" + .ascii ",XEN_VER=xen-3.0" + .ascii ",VIRT_BASE=0x" + utoa __PAGE_OFFSET + .ascii ",ELF_PADDR_OFFSET=0x" + utoa __PAGE_OFFSET + .ascii ",VIRT_ENTRY=0x" + utoa (__PAGE_OFFSET + LOAD_PHYSICAL_ADDR + VIRT_ENTRY_OFFSET) + .ascii ",HYPERCALL_PAGE=0x" + utoa ((LOAD_PHYSICAL_ADDR+HYPERCALL_PAGE_OFFSET)>>PAGE_SHIFT) + .ascii ",FEATURES=writable_page_tables" + .ascii "|writable_descriptor_tables" + .ascii "|auto_translated_physmap" + .ascii "|pae_pgdir_above_4gb" + .ascii "|supervisor_mode_kernel" +#ifdef CONFIG_X86_PAE + .ascii ",PAE=yes[extended-cr3]" +#else + .ascii ",PAE=no" +#endif + .ascii ",LOADER=generic" + .byte 0 +#endif /* CONFIG_XEN_COMPAT <= 0x030002 */ + + + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") + ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET) +#if CONFIG_XEN_COMPAT <= 0x030002 + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long __PAGE_OFFSET) +#else + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long 0) +#endif + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_32) + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) + ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long HYPERVISOR_VIRT_START) + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .ascii "writable_page_tables"; + .ascii "|writable_descriptor_tables"; + .ascii "|auto_translated_physmap"; + .ascii "|pae_pgdir_above_4gb"; + .ascii "|supervisor_mode_kernel"; + .asciz XEN_DOM0_CAP_STR) + ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, .long XEN_DOM0_CAP | + (1 << XENFEAT_writable_page_tables) | + (1 << XENFEAT_writable_descriptor_tables) | + (1 << XENFEAT_auto_translated_physmap) | + (1 << XENFEAT_pae_pgdir_above_4gb) | + (1 << XENFEAT_supervisor_mode_kernel)) +#ifdef CONFIG_X86_PAE + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad _PAGE_PRESENT, _PAGE_PRESENT) +#else + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no") + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .long _PAGE_PRESENT, _PAGE_PRESENT) +#endif + ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") + ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) diff --git a/arch/x86/kernel/head_64-xen.S b/arch/x86/kernel/head_64-xen.S new file mode 100644 index 0000000..c8ce8bd --- /dev/null +++ b/arch/x86/kernel/head_64-xen.S @@ -0,0 +1,176 @@ +/* + * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit + * + * Copyright (C) 2000 Andrea Arcangeli SuSE + * Copyright (C) 2000 Pavel Machek + * Copyright (C) 2000 Karsten Keil + * Copyright (C) 2001,2002 Andi Kleen + * Copyright (C) 2005 Eric Biederman + * Jun Nakajima + * Modified for Xen + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + __HEAD + .code64 + .globl startup_64 +startup_64: + movq $(init_thread_union+THREAD_SIZE-8),%rsp + + /* rsi 
is pointer to startup info structure. + pass it to C */ + movq %rsi,%rdi + + /* Set up %gs. + * + * The base of %gs always points to the bottom of the irqstack + * union. If the stack protector canary is enabled, it is + * located at %gs:40. Note that, on SMP, the boot cpu uses + * init data section till per cpu areas are set up. + */ + movl $MSR_GS_BASE,%ecx + movq $INIT_PER_CPU_VAR(irq_stack_union),%rax + movq %rax,%rdx + shrq $32,%rdx + wrmsr + + pushq $0 # fake return address + jmp x86_64_start_kernel + +#define NEXT_PAGE(name) \ + .balign PAGE_SIZE; \ +ENTRY(name) + + __PAGE_ALIGNED_BSS +NEXT_PAGE(init_level4_pgt) + .fill 512,8,0 + +NEXT_PAGE(level3_kernel_pgt) + .fill 512,8,0 + + /* + * This is used for vsyscall area mapping as we have a different + * level4 page table for user. + */ +NEXT_PAGE(level3_user_pgt) + .fill 512,8,0 + +NEXT_PAGE(level2_fixmap_pgt) + .fill 512,8,0 + +NEXT_PAGE(level1_fixmap_pgt) + .fill 512,8,0 + + .previous +NEXT_PAGE(hypercall_page) + phys_hypercall_page = . - .head.text + CFI_STARTPROC + .rept 0x1000 / 0x20 + .skip 1 /* push %rcx */ + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rcx,0 + .skip 2 /* push %r11 */ + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rcx,0 + .skip 5 /* mov $#,%eax */ + .skip 2 /* syscall */ + .skip 2 /* pop %r11 */ + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE r11 + .skip 1 /* pop %rcx */ + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rcx + .align 0x20,0 /* ret */ + .endr + CFI_ENDPROC + +#undef NEXT_PAGE + + __PAGE_ALIGNED_BSS + .align PAGE_SIZE +ENTRY(empty_zero_page) + .skip PAGE_SIZE + +#ifdef CONFIG_XEN_UNPRIVILEGED_GUEST +# define XEN_DOM0_CAP 0 +# define XEN_DOM0_CAP_STR "" +#else +# define XEN_DOM0_CAP (1 << XENFEAT_dom0) +# if CONFIG_XEN_COMPAT < 0x040200 +# define XEN_DOM0_CAP_STR "" +# else +# define XEN_DOM0_CAP_STR "|dom0" +# endif +#endif + +#if CONFIG_XEN_COMPAT <= 0x030002 +/* + * __xen_guest information + */ +.macro utoh value + i = 64 + .rept 16 + i = i - 4 + .byte '0' + ((((\value) >> i) & 0xf) > 9) * ('0' - 'A' + 10) + (((\value) >> i) & 0xf) + .endr +.endm + +.section __xen_guest + .ascii "GUEST_OS=linux,GUEST_VER=2.6" + .ascii ",XEN_VER=xen-3.0" + .ascii ",VIRT_BASE=0x" + utoh __START_KERNEL_map + .ascii ",ELF_PADDR_OFFSET=0x" + utoh __START_KERNEL_map + .ascii ",VIRT_ENTRY=0x" + utoh (__START_KERNEL_map + __PHYSICAL_START) + .ascii ",HYPERCALL_PAGE=0x" + utoh (phys_hypercall_page >> PAGE_SHIFT) + .ascii ",FEATURES=writable_page_tables" + .ascii "|writable_descriptor_tables" + .ascii "|auto_translated_physmap" + .ascii "|supervisor_mode_kernel" + .ascii ",LOADER=generic" + .byte 0 +#endif /* CONFIG_XEN_COMPAT <= 0x030002 */ + + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") + ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .quad __START_KERNEL_map) +#if CONFIG_XEN_COMPAT <= 0x030002 + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .quad __START_KERNEL_map) +#else + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .quad 0) +#endif + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .quad startup_64) + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .quad hypercall_page) + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad _PAGE_PRESENT, _PAGE_PRESENT) + ELFNOTE(Xen, XEN_ELFNOTE_MOD_START_PFN, .long 1) + ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad VMEMMAP_START) + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .ascii "writable_page_tables"; + .ascii "|writable_descriptor_tables"; + .ascii "|auto_translated_physmap"; + .ascii "|supervisor_mode_kernel"; + .asciz XEN_DOM0_CAP_STR) + 
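+	/* These ELF notes are parsed by the Xen domain builder when the guest
+	   is created; XEN_ELFNOTE_SUPPORTED_FEATURES below is the bitmask
+	   counterpart of the FEATURES string above. */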
ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, .long XEN_DOM0_CAP | + (1 << XENFEAT_writable_page_tables) | + (1 << XENFEAT_writable_descriptor_tables) | + (1 << XENFEAT_auto_translated_physmap) | + (1 << XENFEAT_supervisor_mode_kernel)) + ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") + ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index 43e9ccf..c50f863 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -31,6 +31,7 @@ union thread_union init_thread_union __init_task_data = struct task_struct init_task = INIT_TASK(init_task); EXPORT_SYMBOL(init_task); +#ifndef CONFIG_X86_NO_TSS /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, * no more per-task TSS's. The TSS size is kept cacheline-aligned @@ -39,4 +40,4 @@ EXPORT_SYMBOL(init_task); * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; - +#endif diff --git a/arch/x86/kernel/ioport-xen.c b/arch/x86/kernel/ioport-xen.c new file mode 100644 index 0000000..3cb400e --- /dev/null +++ b/arch/x86/kernel/ioport-xen.c @@ -0,0 +1,84 @@ +/* + * This contains the io-permission bitmap code - written by obz, with changes + * by Linus. 32/64 bits code unification by Miguel Botón. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * this changes the io permissions bitmap in the current task. + */ +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) +{ + struct thread_struct *t = ¤t->thread; + struct physdev_set_iobitmap set_iobitmap; + + if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) + return -EINVAL; + if (turn_on && !capable(CAP_SYS_RAWIO)) + return -EPERM; + + /* + * If it's the first ioperm() call in this thread's lifetime, set the + * IO bitmap up. ioperm() is much less timing critical than clone(), + * this is why we delay this operation until now: + */ + if (!t->io_bitmap_ptr) { + unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + + if (!bitmap) + return -ENOMEM; + + memset(bitmap, 0xff, IO_BITMAP_BYTES); + t->io_bitmap_ptr = bitmap; + set_thread_flag(TIF_IO_BITMAP); + + set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap); + set_iobitmap.nr_ports = IO_BITMAP_BITS; + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, + &set_iobitmap)); + } + + if (turn_on) + bitmap_clear(t->io_bitmap_ptr, from, num); + else + bitmap_set(t->io_bitmap_ptr, from, num); + + return 0; +} + +/* + * sys_iopl has to be used when you want to access the IO ports + * beyond the 0x3ff range: to get the full 65536 ports bitmapped + * you'd need 8kB of bitmaps/process, which is a bit excessive. + */ +long sys_iopl(unsigned int level, struct pt_regs *regs) +{ + struct thread_struct *t = ¤t->thread; + unsigned int old = t->iopl >> 12; + + if (level > 3) + return -EINVAL; + /* Trying to gain more privileges? 
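+	 * Raising the IOPL requires CAP_SYS_RAWIO; lowering it is always
+	 * permitted.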
*/ + if (level > old) { + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + } + t->iopl = level << 12; + set_iopl_mask(t->iopl); + + return 0; +} diff --git a/arch/x86/kernel/irq-xen.c b/arch/x86/kernel/irq-xen.c new file mode 100644 index 0000000..e12c762 --- /dev/null +++ b/arch/x86/kernel/irq-xen.c @@ -0,0 +1,350 @@ +/* + * Common interrupt code for 32 and 64 bit + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#ifndef CONFIG_XEN +atomic_t irq_err_count; + +/* Function pointer for generic interrupt vector handling */ +void (*x86_platform_ipi_callback)(void) = NULL; +#endif + +/* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themselves. + */ +void ack_bad_irq(unsigned int irq) +{ + if (printk_ratelimit()) + pr_err("unexpected IRQ trap at vector %02x\n", irq); + +#ifndef CONFIG_XEN + /* + * Currently unexpected vectors happen only on SMP and APIC. + * We _must_ ack these because every local APIC has only N + * irq slots per priority level, and a 'hanging, unacked' IRQ + * holds up an irq slot - in excessive cases (when multiple + * unexpected vectors occur) that might lock up the APIC + * completely. + * But only ack when the APIC is enabled -AK + */ + ack_APIC_irq(); +#endif +} + +#define irq_stats(x) (&per_cpu(irq_stat, x)) +/* + * /proc/interrupts printing for arch specific interrupts + */ +int arch_show_interrupts(struct seq_file *p, int prec) +{ + int j; + + seq_printf(p, "%*s: ", prec, "NMI"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->__nmi_count); + seq_printf(p, " Non-maskable interrupts\n"); +#ifdef CONFIG_X86_LOCAL_APIC +#ifndef CONFIG_XEN + seq_printf(p, "%*s: ", prec, "LOC"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); + seq_printf(p, " Local timer interrupts\n"); + + seq_printf(p, "%*s: ", prec, "SPU"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); + seq_printf(p, " Spurious interrupts\n"); + seq_printf(p, "%*s: ", prec, "PMI"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); + seq_printf(p, " Performance monitoring interrupts\n"); +#endif + seq_printf(p, "%*s: ", prec, "IWI"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs); + seq_printf(p, " IRQ work interrupts\n"); +#ifndef CONFIG_XEN + seq_printf(p, "%*s: ", prec, "RTR"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count); + seq_printf(p, " APIC ICR read retries\n"); +#endif +#endif +#ifndef CONFIG_XEN + if (x86_platform_ipi_callback) { + seq_printf(p, "%*s: ", prec, "PLT"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis); + seq_printf(p, " Platform interrupts\n"); + } +#endif +#ifdef CONFIG_SMP + seq_printf(p, "%*s: ", prec, "RES"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count); + seq_printf(p, " Rescheduling interrupts\n"); + seq_printf(p, "%*s: ", prec, "CAL"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_call_count); + seq_printf(p, " Function call interrupts\n"); +#ifndef CONFIG_XEN + seq_printf(p, "%*s: ", prec, "TLB"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); + seq_printf(p, " TLB shootdowns\n"); +#else + seq_printf(p, "%*s: ", prec, "LCK"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", 
irq_stats(j)->irq_lock_count); + seq_printf(p, " Spinlock wakeups\n"); +#endif +#endif +#ifdef CONFIG_X86_THERMAL_VECTOR + seq_printf(p, "%*s: ", prec, "TRM"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); + seq_printf(p, " Thermal event interrupts\n"); +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD + seq_printf(p, "%*s: ", prec, "THR"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); + seq_printf(p, " Threshold APIC interrupts\n"); +#endif +#ifdef CONFIG_X86_MCE + seq_printf(p, "%*s: ", prec, "MCE"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); + seq_printf(p, " Machine check exceptions\n"); + seq_printf(p, "%*s: ", prec, "MCP"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(mce_poll_count, j)); + seq_printf(p, " Machine check polls\n"); +#endif +#ifndef CONFIG_XEN + seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); +#if defined(CONFIG_X86_IO_APIC) + seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); +#endif +#endif + return 0; +} + +/* + * /proc/stat helpers + */ +u64 arch_irq_stat_cpu(unsigned int cpu) +{ + u64 sum = irq_stats(cpu)->__nmi_count; + +#ifdef CONFIG_X86_LOCAL_APIC + sum += irq_stats(cpu)->apic_timer_irqs; + sum += irq_stats(cpu)->irq_spurious_count; + sum += irq_stats(cpu)->apic_perf_irqs; + sum += irq_stats(cpu)->apic_irq_work_irqs; + sum += irq_stats(cpu)->icr_read_retry_count; +#endif +#ifndef CONFIG_XEN + if (x86_platform_ipi_callback) + sum += irq_stats(cpu)->x86_platform_ipis; +#endif +#ifdef CONFIG_SMP + sum += irq_stats(cpu)->irq_resched_count; + sum += irq_stats(cpu)->irq_call_count; +#ifndef CONFIG_XEN + sum += irq_stats(cpu)->irq_tlb_count; +#else + sum += irq_stats(cpu)->irq_lock_count; +#endif +#endif +#ifdef CONFIG_X86_THERMAL_VECTOR + sum += irq_stats(cpu)->irq_thermal_count; +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD + sum += irq_stats(cpu)->irq_threshold_count; +#endif +#ifdef CONFIG_X86_MCE + sum += per_cpu(mce_exception_count, cpu); + sum += per_cpu(mce_poll_count, cpu); +#endif + return sum; +} + +u64 arch_irq_stat(void) +{ +#ifndef CONFIG_XEN + u64 sum = atomic_read(&irq_err_count); + +#ifdef CONFIG_X86_IO_APIC + sum += atomic_read(&irq_mis_count); +#endif + return sum; +#else + return 0; +#endif +} + + +#ifndef CONFIG_XEN +/* + * do_IRQ handles all normal device IRQ's (the special + * SMP cross-CPU interrupts have their own specific + * handlers). + */ +unsigned int __irq_entry do_IRQ(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + /* high bit used in ret_from_ code */ + unsigned vector = ~regs->orig_ax; + unsigned irq; + + irq_enter(); + exit_idle(); + + irq = __this_cpu_read(vector_irq[vector]); + + if (!handle_irq(irq, regs)) { + ack_APIC_irq(); + + if (printk_ratelimit()) + pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n", + __func__, smp_processor_id(), vector, irq); + } + + irq_exit(); + + set_irq_regs(old_regs); + return 1; +} + +/* + * Handler for X86_PLATFORM_IPI_VECTOR. + */ +void smp_x86_platform_ipi(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + ack_APIC_irq(); + + irq_enter(); + + exit_idle(); + + inc_irq_stat(x86_platform_ipis); + + if (x86_platform_ipi_callback) + x86_platform_ipi_callback(); + + irq_exit(); + + set_irq_regs(old_regs); +} +#endif + +#ifdef CONFIG_HOTPLUG_CPU +#include +/* A cpu has been removed from cpu_online_mask. Reset irq affinities. 
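+ * This runs on the CPU being offlined, with interrupts disabled. Interrupts
+ * still bound to this CPU get their affinity re-set (falling back to
+ * cpu_all_mask when none of their mask is online any more), and any found
+ * pending here afterwards are retriggered so they fire on the new target.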
*/ +void fixup_irqs(void) +{ + unsigned int irq; + static int warned; + struct irq_desc *desc; + struct irq_data *data; + struct irq_chip *chip; + static DECLARE_BITMAP(irqs_used, NR_IRQS); + + for_each_irq_desc(irq, desc) { + int break_affinity = 0; + int set_affinity = 1; + const struct cpumask *affinity; + + if (!desc) + continue; + if (irq == 2) + continue; + + /* interrupt's are disabled at this point */ + raw_spin_lock(&desc->lock); + + data = irq_desc_get_irq_data(desc); + affinity = data->affinity; + if (!irq_has_action(irq) || irqd_is_per_cpu(data) || + cpumask_subset(affinity, cpu_online_mask)) { + raw_spin_unlock(&desc->lock); + continue; + } + + if (cpumask_test_cpu(smp_processor_id(), affinity)) + __set_bit(irq, irqs_used); + + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { + break_affinity = 1; + affinity = cpu_all_mask; + } + + chip = irq_data_get_irq_chip(data); + if (!irqd_can_move_in_process_context(data) && chip->irq_mask) + chip->irq_mask(data); + + if (chip->irq_set_affinity) + chip->irq_set_affinity(data, affinity, true); + else if (data->chip != &no_irq_chip && !(warned++)) + set_affinity = 0; + + if (!irqd_can_move_in_process_context(data) && + !irqd_irq_disabled(data) && chip->irq_unmask) + chip->irq_unmask(data); + + raw_spin_unlock(&desc->lock); + + if (break_affinity && set_affinity) + /*printk("Broke affinity for irq %i\n", irq)*/; + else if (!set_affinity) + printk("Cannot set affinity for irq %i\n", irq); + } + + /* + * We can remove mdelay() and then send spuriuous interrupts to + * new cpu targets for all the irqs that were handled previously by + * this cpu. While it works, I have seen spurious interrupt messages + * (nothing wrong but still...). + * + * So for now, retain mdelay(1) and check the IRR and then send those + * interrupts to new targets as this cpu is already offlined... + */ + mdelay(1); + + for_each_irq_desc(irq, desc) { + if (!__test_and_clear_bit(irq, irqs_used)) + continue; + + if (xen_test_irq_pending(irq)) { + desc = irq_to_desc(irq); + data = irq_desc_get_irq_data(desc); + chip = irq_data_get_irq_chip(data); + raw_spin_lock(&desc->lock); + if (chip->irq_retrigger) + chip->irq_retrigger(data); + raw_spin_unlock(&desc->lock); + } + } +} +#endif diff --git a/arch/x86/kernel/irq_work-xen.c b/arch/x86/kernel/irq_work-xen.c new file mode 100644 index 0000000..851414e --- /dev/null +++ b/arch/x86/kernel/irq_work-xen.c @@ -0,0 +1,21 @@ +/* + * x86/Xen specific code for irq_work + */ + +#include +#include +#include +#include + +#ifdef CONFIG_SMP +void smp_irq_work_interrupt(struct pt_regs *regs) +{ + inc_irq_stat(apic_irq_work_irqs); + irq_work_run(); +} + +void arch_irq_work_raise(void) +{ + xen_send_IPI_self(IRQ_WORK_VECTOR); +} +#endif diff --git a/arch/x86/kernel/ldt-xen.c b/arch/x86/kernel/ldt-xen.c new file mode 100644 index 0000000..5d56e69 --- /dev/null +++ b/arch/x86/kernel/ldt-xen.c @@ -0,0 +1,272 @@ +/* + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds + * Copyright (C) 1999 Ingo Molnar + * Copyright (C) 2002 Andi Kleen + * + * This handles calls from both 32bit and 64bit mode. 
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#ifdef CONFIG_SMP
+static void flush_ldt(void *current_mm)
+{
+	if (current->active_mm == current_mm)
+		load_LDT(&current->active_mm->context);
+}
+#endif
+
+static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
+{
+	void *oldldt, *newldt;
+	int oldsize;
+
+	if (mincount <= pc->size)
+		return 0;
+	oldsize = pc->size;
+	mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) &
+			(~(PAGE_SIZE / LDT_ENTRY_SIZE - 1));
+	if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE)
+		newldt = vmalloc(mincount * LDT_ENTRY_SIZE);
+	else
+		newldt = (void *)__get_free_page(GFP_KERNEL);
+
+	if (!newldt)
+		return -ENOMEM;
+
+	if (oldsize)
+		memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE);
+	oldldt = pc->ldt;
+	memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
+	       (mincount - oldsize) * LDT_ENTRY_SIZE);
+
+#ifdef CONFIG_X86_64
+	/* CHECKME: Do we really need this ? */
+	wmb();
+#endif
+	pc->ldt = newldt;
+	wmb();
+	pc->size = mincount;
+	wmb();
+
+	if (reload) {
+#ifdef CONFIG_SMP
+		preempt_disable();
+#endif
+		make_pages_readonly(newldt,
+				    (mincount * LDT_ENTRY_SIZE) / PAGE_SIZE,
+				    XENFEAT_writable_descriptor_tables);
+		load_LDT(pc);
+#ifdef CONFIG_SMP
+		if (!cpumask_equal(mm_cpumask(current->mm),
+				   cpumask_of(smp_processor_id())))
+			smp_call_function(flush_ldt, current->mm, 1);
+		preempt_enable();
+#endif
+	}
+	if (oldsize) {
+		make_pages_writable(oldldt,
+				    (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
+				    XENFEAT_writable_descriptor_tables);
+		if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
+			vfree(oldldt);
+		else
+			put_page(virt_to_page(oldldt));
+	}
+	return 0;
+}
+
+static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
+{
+	int err = alloc_ldt(new, old->size, 0);
+
+	if (err < 0)
+		return err;
+	memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE);
+	make_pages_readonly(new->ldt,
+			    (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
+			    XENFEAT_writable_descriptor_tables);
+	return 0;
+}
+
+/*
+ * we do not have to muck with descriptors here, that is
+ * done in switch_mm() as needed.
+ */
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+{
+	struct mm_struct *old_mm;
+	int retval = 0;
+
+	memset(&mm->context, 0, sizeof(mm->context));
+	mutex_init(&mm->context.lock);
+	old_mm = current->mm;
+	if (old_mm)
+		mm->context.vdso = old_mm->context.vdso;
+	if (old_mm && old_mm->context.size > 0) {
+		mutex_lock(&old_mm->context.lock);
+		retval = copy_ldt(&mm->context, &old_mm->context);
+		mutex_unlock(&old_mm->context.lock);
+	}
+	return retval;
+}
+
+/*
+ * No need to lock the MM as we are the last user
+ *
+ * 64bit: Don't touch the LDT register - we're already in the next thread.
+ */
+void destroy_context(struct mm_struct *mm)
+{
+	if (mm->context.size) {
+		/* CHECKME: Can this ever happen ? */
+		if (mm == current->active_mm)
+			clear_LDT();
+		make_pages_writable(mm->context.ldt,
+				    (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
+				    XENFEAT_writable_descriptor_tables);
+		if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
+			vfree(mm->context.ldt);
+		else
+			put_page(virt_to_page(mm->context.ldt));
+		mm->context.size = 0;
+	}
+}
+
+static int read_ldt(void __user *ptr, unsigned long bytecount)
+{
+	int err;
+	unsigned long size;
+	struct mm_struct *mm = current->mm;
+
+	if (!mm->context.size)
+		return 0;
+	if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
+		bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
+
+	mutex_lock(&mm->context.lock);
+	size = mm->context.size * LDT_ENTRY_SIZE;
+	if (size > bytecount)
+		size = bytecount;
+
+	err = 0;
+	if (copy_to_user(ptr, mm->context.ldt, size))
+		err = -EFAULT;
+	mutex_unlock(&mm->context.lock);
+	if (err < 0)
+		goto error_return;
+	if (size != bytecount) {
+		/* zero-fill the rest */
+		if (clear_user(ptr + size, bytecount - size) != 0) {
+			err = -EFAULT;
+			goto error_return;
+		}
+	}
+	return bytecount;
+error_return:
+	return err;
+}
+
+static int read_default_ldt(void __user *ptr, unsigned long bytecount)
+{
+	/* CHECKME: Can we use _one_ random number ? */
+#ifdef CONFIG_X86_32
+	unsigned long size = 5 * sizeof(struct desc_struct);
+#else
+	unsigned long size = 128;
+#endif
+	if (bytecount > size)
+		bytecount = size;
+	if (clear_user(ptr, bytecount))
+		return -EFAULT;
+	return bytecount;
+}
+
+static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
+{
+	struct mm_struct *mm = current->mm;
+	struct desc_struct ldt;
+	int error;
+	struct user_desc ldt_info;
+
+	error = -EINVAL;
+	if (bytecount != sizeof(ldt_info))
+		goto out;
+	error = -EFAULT;
+	if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
+		goto out;
+
+	error = -EINVAL;
+	if (ldt_info.entry_number >= LDT_ENTRIES)
+		goto out;
+	if (ldt_info.contents == 3) {
+		if (oldmode)
+			goto out;
+		if (ldt_info.seg_not_present == 0)
+			goto out;
+	}
+
+	mutex_lock(&mm->context.lock);
+	if (ldt_info.entry_number >= mm->context.size) {
+		error = alloc_ldt(&current->mm->context,
+				  ldt_info.entry_number + 1, 1);
+		if (error < 0)
+			goto out_unlock;
+	}
+
+	/* Allow LDTs to be cleared by the user. */
+	if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
+		if (oldmode || LDT_empty(&ldt_info)) {
+			memset(&ldt, 0, sizeof(ldt));
+			goto install;
+		}
+	}
+
+	fill_ldt(&ldt, &ldt_info);
+	if (oldmode)
+		ldt.avl = 0;
+
+	/* Install the new entry ...
*/ +install: + error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt); + +out_unlock: + mutex_unlock(&mm->context.lock); +out: + return error; +} + +asmlinkage int sys_modify_ldt(int func, void __user *ptr, + unsigned long bytecount) +{ + int ret = -ENOSYS; + + switch (func) { + case 0: + ret = read_ldt(ptr, bytecount); + break; + case 1: + ret = write_ldt(ptr, bytecount, 1); + break; + case 2: + ret = read_default_ldt(ptr, bytecount); + break; + case 0x11: + ret = write_ldt(ptr, bytecount, 0); + break; + } + return ret; +} diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index a3fa43b..6d5a5d9 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -27,47 +27,9 @@ #include #include -static void set_idt(void *newidt, __u16 limit) -{ - struct desc_ptr curidt; - - /* ia32 supports unaliged loads & stores */ - curidt.size = limit; - curidt.address = (unsigned long)newidt; - - load_idt(&curidt); -} - - -static void set_gdt(void *newgdt, __u16 limit) -{ - struct desc_ptr curgdt; - - /* ia32 supports unaligned loads & stores */ - curgdt.size = limit; - curgdt.address = (unsigned long)newgdt; - - load_gdt(&curgdt); -} - -static void load_segments(void) -{ -#define __STR(X) #X -#define STR(X) __STR(X) - - __asm__ __volatile__ ( - "\tljmp $"STR(__KERNEL_CS)",$1f\n" - "\t1:\n" - "\tmovl $"STR(__KERNEL_DS)",%%eax\n" - "\tmovl %%eax,%%ds\n" - "\tmovl %%eax,%%es\n" - "\tmovl %%eax,%%fs\n" - "\tmovl %%eax,%%gs\n" - "\tmovl %%eax,%%ss\n" - : : : "eax", "memory"); -#undef STR -#undef __STR -} +#ifdef CONFIG_XEN +#include +#endif static void machine_kexec_free_page_tables(struct kimage *image) { @@ -84,6 +46,17 @@ static int machine_kexec_alloc_page_tables(struct kimage *image) { image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL); #ifdef CONFIG_X86_PAE +#ifdef CONFIG_XEN /* machine address must fit into xki->page_list[PA_PGD] */ + if (image->arch.pgd) { + struct page *pg = virt_to_page(image->arch.pgd); + + if (xen_limit_pages_to_max_mfn(pg, 0, BITS_PER_LONG) < 0) { + image->arch.pgd = NULL; + __free_page(pg); + return -ENOMEM; + } + } +#endif image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL); image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL); #endif @@ -139,6 +112,38 @@ static void machine_kexec_prepare_page_tables(struct kimage *image) __pa(control_page), __pa(control_page)); } +#ifdef CONFIG_XEN + +#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT) + +#if PAGES_NR > KEXEC_XEN_NO_PAGES +#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break +#endif + +#if PA_CONTROL_PAGE != 0 +#error PA_CONTROL_PAGE is non zero - Xen support will break +#endif + +void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image) +{ + void *control_page; + + memset(xki->page_list, 0, sizeof(xki->page_list)); + + control_page = page_address(image->control_code_page); + memcpy(control_page, relocate_kernel, PAGE_SIZE); + + xki->page_list[PA_CONTROL_PAGE] = __ma(control_page); + xki->page_list[PA_PGD] = __ma(image->arch.pgd); + + if (image->type == KEXEC_TYPE_DEFAULT) + xki->page_list[PA_SWAP_PAGE] = page_to_phys(image->swap_page); +} + +#include "machine_kexec_xen.c" + +#endif /* CONFIG_XEN */ + /* * A architecture hook called to validate the * proposed image and prepare the control pages @@ -176,6 +181,7 @@ void machine_kexec_cleanup(struct kimage *image) machine_kexec_free_page_tables(image); } +#ifndef CONFIG_XEN /* * Do not allocate memory (or fail in any way) in 
machine_kexec(). * We are past the point of no return, committed to rebooting now. @@ -228,24 +234,6 @@ void machine_kexec(struct kimage *image) page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT); - /* - * The segment registers are funny things, they have both a - * visible and an invisible part. Whenever the visible part is - * set to a specific selector, the invisible part is loaded - * with from a table in memory. At no other time is the - * descriptor table in memory accessed. - * - * I take advantage of this here by force loading the - * segments, before I zap the gdt with an invalid value. - */ - load_segments(); - /* - * The gdt & idt are now invalid. - * If you want to load them you must set up your own idt & gdt. - */ - set_gdt(phys_to_virt(0), 0); - set_idt(phys_to_virt(0), 0); - /* now call it */ image->start = relocate_kernel_ptr((unsigned long)image->head, (unsigned long)page_list, @@ -259,6 +247,7 @@ void machine_kexec(struct kimage *image) __ftrace_enabled_restore(save_ftrace_enabled); } +#endif void arch_crash_save_vmcoreinfo(void) { diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index b3ea9db..d8d77db 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -21,6 +21,101 @@ #include #include +#ifdef CONFIG_XEN + +/* In the case of Xen, override hypervisor functions to be able to create + * a regular identity mapping page table... + */ + +#include +#include + +#define x__pmd(x) ((pmd_t) { (x) } ) +#define x__pud(x) ((pud_t) { (x) } ) +#define x__pgd(x) ((pgd_t) { (x) } ) + +#define x_pmd_val(x) ((x).pmd) +#define x_pud_val(x) ((x).pud) +#define x_pgd_val(x) ((x).pgd) + +static inline void x_set_pmd(pmd_t *dst, pmd_t val) +{ + x_pmd_val(*dst) = x_pmd_val(val); +} + +static inline void x_set_pud(pud_t *dst, pud_t val) +{ + x_pud_val(*dst) = phys_to_machine(x_pud_val(val)); +} + +static inline void x_pud_clear (pud_t *pud) +{ + x_pud_val(*pud) = 0; +} + +static inline void x_set_pgd(pgd_t *dst, pgd_t val) +{ + x_pgd_val(*dst) = phys_to_machine(x_pgd_val(val)); +} + +static inline void x_pgd_clear (pgd_t * pgd) +{ + x_pgd_val(*pgd) = 0; +} + +#define X__PAGE_KERNEL_LARGE_EXEC \ + _PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_PSE +#define X_KERNPG_TABLE _PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY + +#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT) + +#if PAGES_NR > KEXEC_XEN_NO_PAGES +#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break +#endif + +#if PA_CONTROL_PAGE != 0 +#error PA_CONTROL_PAGE is non zero - Xen support will break +#endif + +void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image) +{ + void *control_page; + void *table_page; + + memset(xki->page_list, 0, sizeof(xki->page_list)); + + control_page = page_address(image->control_code_page) + PAGE_SIZE; + memcpy(control_page, relocate_kernel, PAGE_SIZE); + + table_page = page_address(image->control_code_page); + + xki->page_list[PA_CONTROL_PAGE] = __ma(control_page); + xki->page_list[PA_TABLE_PAGE] = __ma(table_page); + + if (image->type == KEXEC_TYPE_DEFAULT) + xki->page_list[PA_SWAP_PAGE] = page_to_phys(image->swap_page); +} + +#include "machine_kexec_xen.c" + +#else /* CONFIG_XEN */ + +#define x__pmd(x) __pmd(x) +#define x__pud(x) __pud(x) +#define x__pgd(x) __pgd(x) + +#define x_set_pmd(x, y) set_pmd(x, y) +#define x_set_pud(x, y) set_pud(x, y) +#define x_set_pgd(x, y) set_pgd(x, y) + +#define x_pud_clear(x) pud_clear(x) 
+#define x_pgd_clear(x) pgd_clear(x) + +#define X__PAGE_KERNEL_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC +#define X_KERNPG_TABLE _KERNPG_TABLE + +#endif /* CONFIG_XEN */ + static int init_one_level2_page(struct kimage *image, pgd_t *pgd, unsigned long addr) { @@ -50,7 +145,7 @@ static int init_one_level2_page(struct kimage *image, pgd_t *pgd, } pmd = pmd_offset(pud, addr); if (!pmd_present(*pmd)) - set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); + x_set_pmd(pmd, x__pmd(addr | X__PAGE_KERNEL_LARGE_EXEC)); result = 0; out: return result; @@ -63,7 +158,7 @@ static void init_level2_page(pmd_t *level2p, unsigned long addr) addr &= PAGE_MASK; end_addr = addr + PUD_SIZE; while (addr < end_addr) { - set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); + x_set_pmd(level2p++, x__pmd(addr | X__PAGE_KERNEL_LARGE_EXEC)); addr += PMD_SIZE; } } @@ -88,12 +183,12 @@ static int init_level3_page(struct kimage *image, pud_t *level3p, } level2p = (pmd_t *)page_address(page); init_level2_page(level2p, addr); - set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE)); + x_set_pud(level3p++, x__pud(__pa(level2p) | X_KERNPG_TABLE)); addr += PUD_SIZE; } /* clear the unused entries */ while (addr < end_addr) { - pud_clear(level3p++); + x_pud_clear(level3p++); addr += PUD_SIZE; } out: @@ -123,12 +218,12 @@ static int init_level4_page(struct kimage *image, pgd_t *level4p, result = init_level3_page(image, level3p, addr, last_addr); if (result) goto out; - set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE)); + x_set_pgd(level4p++, x__pgd(__pa(level3p) | X_KERNPG_TABLE)); addr += PGDIR_SIZE; } /* clear the unused entries */ while (addr < end_addr) { - pgd_clear(level4p++); + x_pgd_clear(level4p++); addr += PGDIR_SIZE; } out: @@ -189,8 +284,14 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) { pgd_t *level4p; int result; + unsigned long x_max_pfn = max_pfn; + +#ifdef CONFIG_XEN + x_max_pfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL); +#endif + level4p = (pgd_t *)__va(start_pgtable); - result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT); + result = init_level4_page(image, level4p, 0, x_max_pfn << PAGE_SHIFT); if (result) return result; /* @@ -203,47 +304,6 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) return init_transition_pgtable(image, level4p); } -static void set_idt(void *newidt, u16 limit) -{ - struct desc_ptr curidt; - - /* x86-64 supports unaliged loads & stores */ - curidt.size = limit; - curidt.address = (unsigned long)newidt; - - __asm__ __volatile__ ( - "lidtq %0\n" - : : "m" (curidt) - ); -}; - - -static void set_gdt(void *newgdt, u16 limit) -{ - struct desc_ptr curgdt; - - /* x86-64 supports unaligned loads & stores */ - curgdt.size = limit; - curgdt.address = (unsigned long)newgdt; - - __asm__ __volatile__ ( - "lgdtq %0\n" - : : "m" (curgdt) - ); -}; - -static void load_segments(void) -{ - __asm__ __volatile__ ( - "\tmovl %0,%%ds\n" - "\tmovl %0,%%es\n" - "\tmovl %0,%%ss\n" - "\tmovl %0,%%fs\n" - "\tmovl %0,%%gs\n" - : : "a" (__KERNEL_DS) : "memory" - ); -} - int machine_kexec_prepare(struct kimage *image) { unsigned long start_pgtable; @@ -265,6 +325,7 @@ void machine_kexec_cleanup(struct kimage *image) free_transition_pgtable(image); } +#ifndef CONFIG_XEN /* * Do not allocate memory (or fail in any way) in machine_kexec(). * We are past the point of no return, committed to rebooting now. 
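The page-table fill code in init_level2/3/4_page() is shared between the native and Xen builds; only the entry encoding differs. Under Xen the kexec identity-map tables must hold machine addresses, so x_set_pud() and x_set_pgd() route each table address through phys_to_machine() (x_set_pmd() stores its value unchanged), while the native build aliases the x_*() names straight onto set_pmd()/set_pud()/set_pgd(). A minimal stand-alone sketch of that wrapper pattern, not kernel code: phys_to_machine() is stubbed with a made-up offset, and SKETCH_XEN is a hypothetical switch standing in for CONFIG_XEN.

/* Sketch only: one fill loop, two entry encodings (native vs. "machine"). */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t pud_val_t;

#ifdef SKETCH_XEN
/* Stub: pretend machine frames are the physical frames plus a fixed offset. */
static pud_val_t phys_to_machine(pud_val_t phys) { return phys + 0x100000000ull; }
#define x_pud_entry(phys, flags) (phys_to_machine(phys) | (flags))
#else
#define x_pud_entry(phys, flags) ((phys) | (flags))
#endif

int main(void)
{
	pud_val_t table[4];
	unsigned int i;

	/* The loop is identical for both builds; only the macro differs. */
	for (i = 0; i < 4; i++)
		table[i] = x_pud_entry((pud_val_t)i << 30, 0x63 /* present/rw/acc/dirty */);

	for (i = 0; i < 4; i++)
		printf("pud[%u] = %#llx\n", i, (unsigned long long)table[i]);
	return 0;
}

Built with -DSKETCH_XEN the same loop emits translated entries, without it the native ones, which is why the hunks below can swap set_pmd()/set_pud()/set_pgd() for the x_*() variants without touching the loop structure.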
@@ -311,24 +372,6 @@ void machine_kexec(struct kimage *image) page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT); - /* - * The segment registers are funny things, they have both a - * visible and an invisible part. Whenever the visible part is - * set to a specific selector, the invisible part is loaded - * with from a table in memory. At no other time is the - * descriptor table in memory accessed. - * - * I take advantage of this here by force loading the - * segments, before I zap the gdt with an invalid value. - */ - load_segments(); - /* - * The gdt & idt are now invalid. - * If you want to load them you must set up your own idt & gdt. - */ - set_gdt(phys_to_virt(0), 0); - set_idt(phys_to_virt(0), 0); - /* now call it */ image->start = relocate_kernel((unsigned long)image->head, (unsigned long)page_list, @@ -342,10 +385,13 @@ void machine_kexec(struct kimage *image) __ftrace_enabled_restore(save_ftrace_enabled); } +#endif void arch_crash_save_vmcoreinfo(void) { +#ifndef CONFIG_XEN /* could really be CONFIG_RELOCATABLE */ VMCOREINFO_SYMBOL(phys_base); +#endif VMCOREINFO_SYMBOL(init_level4_pgt); #ifdef CONFIG_NUMA diff --git a/arch/x86/kernel/machine_kexec_xen.c b/arch/x86/kernel/machine_kexec_xen.c new file mode 100644 index 0000000..b171ce6 --- /dev/null +++ b/arch/x86/kernel/machine_kexec_xen.c @@ -0,0 +1,29 @@ +int machine_kexec_setup_resource(struct resource *hypervisor, + struct resource *phys_cpu) +{ + /* The per-cpu crash note resources belong to the hypervisor resource */ + insert_resource(hypervisor, phys_cpu); + if (!phys_cpu->parent) /* outside of hypervisor range */ + insert_resource(&iomem_resource, phys_cpu); + + return 0; +} + +int __init machine_kexec_setup_resources(struct resource *hypervisor, + struct resource *phys_cpus, + int nr_phys_cpus) +{ + unsigned int k; + + insert_resource(&iomem_resource, hypervisor); + if (crashk_res.end > crashk_res.start) + insert_resource(&iomem_resource, &crashk_res); + + for (k = 0; k < nr_phys_cpus; k++) + machine_kexec_setup_resource(hypervisor, phys_cpus + k); + + return xen_create_contiguous_region((unsigned long)&vmcoreinfo_note, + get_order(sizeof(vmcoreinfo_note)), + BITS_PER_LONG); + +} diff --git a/arch/x86/kernel/microcode_core-xen.c b/arch/x86/kernel/microcode_core-xen.c new file mode 100644 index 0000000..4416e2d --- /dev/null +++ b/arch/x86/kernel/microcode_core-xen.c @@ -0,0 +1,299 @@ +/* + * CPU Microcode Update Driver for Linux on Xen + * + * Copyright (C) 2000-2006 Tigran Aivazian + * 2006 Shaohua Li + * + * This driver allows to upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture + * Software Developer's Manual + * Order Number 253668 or free download from: + * + * http://developer.intel.com/Assets/PDF/manual/253668.pdf + * + * For more information, go to http://www.urbanmyth.org/microcode + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+
+MODULE_DESCRIPTION("Microcode Update Driver");
+MODULE_AUTHOR("Tigran Aivazian ");
+MODULE_LICENSE("GPL");
+
+static int verbose;
+module_param(verbose, int, 0644);
+
+#define MICROCODE_VERSION "2.00-xen"
+
+/*
+ * Synchronization.
+ *
+ * All non cpu-hotplug-callback call sites use:
+ *
+ * - microcode_mutex to synchronize with each other;
+ * - get/put_online_cpus() to synchronize with
+ *   the cpu-hotplug-callback call sites.
+ *
+ * We guarantee that only a single cpu is being
+ * updated at any particular moment of time.
+ */
+static DEFINE_MUTEX(microcode_mutex);
+
+#ifdef CONFIG_MICROCODE_OLD_INTERFACE
+static int do_microcode_update(const void __user *ubuf, size_t len)
+{
+	int err;
+	void *kbuf;
+
+	kbuf = vmalloc(len);
+	if (!kbuf)
+		return -ENOMEM;
+
+	if (copy_from_user(kbuf, ubuf, len) == 0) {
+		struct xen_platform_op op;
+
+		op.cmd = XENPF_microcode_update;
+		set_xen_guest_handle(op.u.microcode.data, kbuf);
+		op.u.microcode.length = len;
+		err = HYPERVISOR_platform_op(&op);
+	} else
+		err = -EFAULT;
+
+	vfree(kbuf);
+
+	return err;
+}
+
+static int microcode_open(struct inode *inode, struct file *file)
+{
+	return capable(CAP_SYS_RAWIO) ? nonseekable_open(inode, file) : -EPERM;
+}
+
+static ssize_t microcode_write(struct file *file, const char __user *buf,
+			       size_t len, loff_t *ppos)
+{
+	ssize_t ret = -EINVAL;
+
+	if ((len >> PAGE_SHIFT) > totalram_pages) {
+		pr_err("too much data (max %ld pages)\n", totalram_pages);
+		return ret;
+	}
+
+	mutex_lock(&microcode_mutex);
+
+	if (do_microcode_update(buf, len) == 0)
+		ret = (ssize_t)len;
+
+	mutex_unlock(&microcode_mutex);
+
+	return ret;
+}
+
+static const struct file_operations microcode_fops = {
+	.owner = THIS_MODULE,
+	.write = microcode_write,
+	.open = microcode_open,
+	.llseek = no_llseek,
+};
+
+static struct miscdevice microcode_dev = {
+	.minor = MICROCODE_MINOR,
+	.name = "microcode",
+	.nodename = "cpu/microcode",
+	.fops = &microcode_fops,
+};
+
+static int __init microcode_dev_init(void)
+{
+	int error;
+
+	if (!is_initial_xendomain())
+		return -ENODEV;
+
+	error = misc_register(&microcode_dev);
+	if (error) {
+		pr_err("can't misc_register on minor=%d\n", MICROCODE_MINOR);
+		return error;
+	}
+
+	return 0;
+}
+
+static void __exit microcode_dev_exit(void)
+{
+	misc_deregister(&microcode_dev);
+}
+
+MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
+MODULE_ALIAS("devname:cpu/microcode");
+#else
+#define microcode_dev_init() 0
+#define microcode_dev_exit() do { } while (0)
+#endif
+
+/* fake device for request_firmware */
+static struct platform_device *microcode_pdev;
+
+static int request_microcode(const char *name)
+{
+	const struct firmware *firmware;
+	int error;
+	struct xen_platform_op op;
+
+	error = request_firmware(&firmware, name, &microcode_pdev->dev);
+	if (error) {
+		pr_debug("microcode: data file %s load failed\n", name);
+		return error;
+	}
+
+	op.cmd = XENPF_microcode_update;
+	set_xen_guest_handle(op.u.microcode.data, firmware->data);
+	op.u.microcode.length = firmware->size;
+	error = HYPERVISOR_platform_op(&op);
+
+	release_firmware(firmware);
+
+	if (error)
+		pr_debug("ucode load failed\n");
+
+	return error;
+}
+
+static const char amd_uc_name[] = "amd-ucode/microcode_amd.bin";
+static const char amd_uc_fmt[] = "amd-ucode/microcode_amd_fam%x.bin";
+static const char intel_uc_fmt[] = "intel-ucode/%02x-%02x-%02x";
+
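The three format strings just defined drive both request paths: request_microcode() fetches the named blob via request_firmware() and submits it to the hypervisor with XENPF_microcode_update, and the per-physical-CPU notifier that follows composes the name from the vendor, family, model and stepping reported for that pCPU. As a quick illustration of the name construction only, here is a stand-alone snippet; the family/model/stepping values are made-up examples, not read from any hardware.

/* Sketch only: composing microcode firmware names with the driver's format strings. */
#include <stdio.h>

static const char amd_uc_name[] = "amd-ucode/microcode_amd.bin";
static const char amd_uc_fmt[] = "amd-ucode/microcode_amd_fam%x.bin";
static const char intel_uc_fmt[] = "intel-ucode/%02x-%02x-%02x";

int main(void)
{
	char buf[36];

	/* Intel: family 6, model 0x2a, stepping 7 -> "intel-ucode/06-2a-07" */
	snprintf(buf, sizeof(buf), intel_uc_fmt, 6, 0x2a, 7);
	printf("%s\n", buf);

	/* AMD family 0x15 and newer get a per-family file... */
	snprintf(buf, sizeof(buf), amd_uc_fmt, 0x15);
	printf("%s\n", buf);

	/* ...while older AMD families share a single blob. */
	printf("%s\n", amd_uc_name);
	return 0;
}

An Intel family 6, model 0x2a, stepping 7 part would thus request "intel-ucode/06-2a-07", the same per-signature layout used by the intel-ucode firmware tree.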
+static int ucode_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct xen_platform_op op; + char buf[36]; + const char *uc_name = buf; + + switch (action) { + case CPU_ONLINE: + op.cmd = XENPF_get_cpu_version; + op.u.pcpu_version.xen_cpuid = cpu; + if (HYPERVISOR_platform_op(&op)) + break; + if (op.u.pcpu_version.family == boot_cpu_data.x86 + && op.u.pcpu_version.model == boot_cpu_data.x86_model + && op.u.pcpu_version.stepping == boot_cpu_data.x86_mask) + break; + if (strncmp(op.u.pcpu_version.vendor_id, + "GenuineIntel", 12) == 0) + snprintf(buf, sizeof(buf), intel_uc_fmt, + op.u.pcpu_version.family, + op.u.pcpu_version.model, + op.u.pcpu_version.stepping); + else if (strncmp(op.u.pcpu_version.vendor_id, + "AuthenicAMD", 12) == 0) { + if (op.u.pcpu_version.family >= 0x15) + snprintf(buf, sizeof(buf), amd_uc_fmt, + op.u.pcpu_version.family); + else + uc_name = amd_uc_name; + } else + break; + request_microcode(uc_name); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block ucode_cpu_notifier = { + .notifier_call = ucode_cpu_callback +}; + +#ifdef MODULE +/* Autoload on Intel and AMD systems */ +static const struct x86_cpu_id microcode_id[] = { + { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, }, + { X86_VENDOR_AMD, X86_FAMILY_ANY, X86_MODEL_ANY, }, + {} +}; +MODULE_DEVICE_TABLE(x86cpu, microcode_id); +#endif + +static int __init microcode_init(void) +{ + const struct cpuinfo_x86 *c = &boot_cpu_data; + char buf[36]; + const char *fw_name = buf; + int error; + + if (c->x86_vendor == X86_VENDOR_INTEL) + snprintf(buf, sizeof(buf), intel_uc_fmt, + c->x86, c->x86_model, c->x86_mask); + else if (c->x86_vendor == X86_VENDOR_AMD) { + if (c->x86 >= 0x15) + snprintf(buf, sizeof(buf), amd_uc_fmt, c->x86); + else + fw_name = amd_uc_name; + } else { + pr_err("no support for this CPU vendor\n"); + return -ENODEV; + } + + microcode_pdev = platform_device_register_simple("microcode", -1, + NULL, 0); + if (IS_ERR(microcode_pdev)) + return PTR_ERR(microcode_pdev); + + request_microcode(fw_name); + + error = microcode_dev_init(); + if (error) { + platform_device_unregister(microcode_pdev); + return error; + } + + pr_info("Microcode Update Driver: v" MICROCODE_VERSION + " , Peter Oruba\n"); + + error = register_pcpu_notifier(&ucode_cpu_notifier); + if (error) + pr_warn("pCPU notifier registration failed (%d)\n", error); + + return 0; +} +module_init(microcode_init); + +static void __exit microcode_exit(void) +{ + unregister_pcpu_notifier(&ucode_cpu_notifier); + microcode_dev_exit(); + platform_device_unregister(microcode_pdev); + + pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); +} +module_exit(microcode_exit); diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index ac861b8..2c88c6a 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -205,12 +205,20 @@ void __cpuinit fam10h_check_enable_mmcfg(void) return; } +#ifndef CONFIG_XEN printk(KERN_INFO "Enable MMCONFIG on AMD Family 10h\n"); val &= ~((FAM10H_MMIO_CONF_BASE_MASK< + * (c) 1998, 1999, 2000, 2009 Ingo Molnar + * (c) 2008 Alexey Starikovskiy + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static void *_bus_to_virt(unsigned long ma) +{ + return is_ISA_range(ma, ma) ? 
isa_bus_to_virt(ma) : bus_to_virt(ma); +} + +/* + * Checksum an MP configuration block. + */ + +static int __init mpf_checksum(unsigned char *mp, int len) +{ + int sum = 0; + + while (len--) + sum += *mp++; + + return sum & 0xFF; +} + +#ifndef CONFIG_XEN +int __init default_mpc_apic_id(struct mpc_cpu *m) +{ + return m->apicid; +} +#endif + +static void __init MP_processor_info(struct mpc_cpu *m) +{ +#ifndef CONFIG_XEN + int apicid; + char *bootup_cpu = ""; + + if (!(m->cpuflag & CPU_ENABLED)) { + disabled_cpus++; + return; + } + + apicid = x86_init.mpparse.mpc_apic_id(m); + + if (m->cpuflag & CPU_BOOTPROCESSOR) { + bootup_cpu = " (Bootup-CPU)"; + boot_cpu_physical_apicid = m->apicid; + } + + printk(KERN_INFO "Processor #%d%s\n", m->apicid, bootup_cpu); + generic_processor_info(apicid, m->apicver); +#else /* CONFIG_XEN */ + num_processors++; +#endif +} + +#ifdef CONFIG_X86_IO_APIC +void __init default_mpc_oem_bus_info(struct mpc_bus *m, char *str) +{ + memcpy(str, m->bustype, 6); + str[6] = 0; + apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str); +} + +static void __init MP_bus_info(struct mpc_bus *m) +{ + char str[7]; + + x86_init.mpparse.mpc_oem_bus_info(m, str); + +#if MAX_MP_BUSSES < 256 + if (m->busid >= MAX_MP_BUSSES) { + printk(KERN_WARNING "MP table busid value (%d) for bustype %s " + " is too large, max. supported is %d\n", + m->busid, str, MAX_MP_BUSSES - 1); + return; + } +#endif + + set_bit(m->busid, mp_bus_not_pci); + if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) { +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) + mp_bus_id_to_type[m->busid] = MP_BUS_ISA; +#endif + } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { + if (x86_init.mpparse.mpc_oem_pci_bus) + x86_init.mpparse.mpc_oem_pci_bus(m); + + clear_bit(m->busid, mp_bus_not_pci); +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) + mp_bus_id_to_type[m->busid] = MP_BUS_PCI; + } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) { + mp_bus_id_to_type[m->busid] = MP_BUS_EISA; + } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA) - 1) == 0) { + mp_bus_id_to_type[m->busid] = MP_BUS_MCA; +#endif + } else + printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); +} + +static void __init MP_ioapic_info(struct mpc_ioapic *m) +{ + if (m->flags & MPC_APIC_USABLE) + mp_register_ioapic(m->apicid, m->apicaddr, gsi_top); +} + +static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq) +{ + apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," + " IRQ %02x, APIC ID %x, APIC INT %02x\n", + mp_irq->irqtype, mp_irq->irqflag & 3, + (mp_irq->irqflag >> 2) & 3, mp_irq->srcbus, + mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq); +} + +#else /* CONFIG_X86_IO_APIC */ +static inline void __init MP_bus_info(struct mpc_bus *m) {} +static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {} +#endif /* CONFIG_X86_IO_APIC */ + +static void __init MP_lintsrc_info(struct mpc_lintsrc *m) +{ + apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x," + " IRQ %02x, APIC ID %x, APIC LINT %02x\n", + m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbusid, + m->srcbusirq, m->destapic, m->destapiclint); +} + +/* + * Read/parse the MPC + */ +static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str) +{ + + if (memcmp(mpc->signature, MPC_SIGNATURE, 4)) { + printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n", + mpc->signature[0], mpc->signature[1], + mpc->signature[2], mpc->signature[3]); + return 0; + } + if 
(mpf_checksum((unsigned char *)mpc, mpc->length)) { + printk(KERN_ERR "MPTABLE: checksum error!\n"); + return 0; + } + if (mpc->spec != 0x01 && mpc->spec != 0x04) { + printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n", + mpc->spec); + return 0; + } + if (!mpc->lapic) { + printk(KERN_ERR "MPTABLE: null local APIC address!\n"); + return 0; + } + memcpy(oem, mpc->oem, 8); + oem[8] = 0; + printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem); + + memcpy(str, mpc->productid, 12); + str[12] = 0; + + printk(KERN_INFO "MPTABLE: Product ID: %s\n", str); + +#ifndef CONFIG_XEN + printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->lapic); +#endif + + return 1; +} + +static void skip_entry(unsigned char **ptr, int *count, int size) +{ + *ptr += size; + *count += size; +} + +static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt) +{ + printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n" + "type %x\n", *mpt); + print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16, + 1, mpc, mpc->length, 1); +} + +void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } + +static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) +{ + char str[16]; + char oem[10]; + + int count = sizeof(*mpc); + unsigned char *mpt = ((unsigned char *)mpc) + count; + + if (!smp_check_mpc(mpc, oem, str)) + return 0; + +#ifndef CONFIG_XEN +#ifdef CONFIG_X86_32 + generic_mps_oem_check(mpc, oem, str); +#endif + /* Initialize the lapic mapping */ + if (!acpi_lapic) + register_lapic_address(mpc->lapic); +#endif + + if (early) + return 1; + + if (mpc->oemptr) + x86_init.mpparse.smp_read_mpc_oem(mpc); + + /* + * Now process the configuration blocks. + */ + x86_init.mpparse.mpc_record(0); + + while (count < mpc->length) { + switch (*mpt) { + case MP_PROCESSOR: + /* ACPI may have already provided this data */ + if (!acpi_lapic) + MP_processor_info((struct mpc_cpu *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_cpu)); + break; + case MP_BUS: + MP_bus_info((struct mpc_bus *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_bus)); + break; + case MP_IOAPIC: + MP_ioapic_info((struct mpc_ioapic *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_ioapic)); + break; + case MP_INTSRC: + mp_save_irq((struct mpc_intsrc *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_intsrc)); + break; + case MP_LINTSRC: + MP_lintsrc_info((struct mpc_lintsrc *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc)); + break; + default: + /* wrong mptable */ + smp_dump_mptable(mpc, mpt); + count = mpc->length; + break; + } + x86_init.mpparse.mpc_record(1); + } + + if (!num_processors) + printk(KERN_ERR "MPTABLE: no processors registered!\n"); + return num_processors; +} + +#ifdef CONFIG_X86_IO_APIC + +static int __init ELCR_trigger(unsigned int irq) +{ + unsigned int port; + + port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; +} + +static void __init construct_default_ioirq_mptable(int mpc_default_type) +{ + struct mpc_intsrc intsrc; + int i; + int ELCR_fallback = 0; + + intsrc.type = MP_INTSRC; + intsrc.irqflag = 0; /* conforming */ + intsrc.srcbus = 0; + intsrc.dstapic = mpc_ioapic_id(0); + + intsrc.irqtype = mp_INT; + + /* + * If true, we have an ISA/PCI system with no IRQ entries + * in the MP table. To prevent the PCI interrupts from being set up + * incorrectly, we try to use the ELCR. The sanity check to see if + * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can + * never be level sensitive, so we simply see if the ELCR agrees. + * If it does, we assume it's valid. 
+ */ + if (mpc_default_type == 5) { + printk(KERN_INFO "ISA/PCI bus type with no IRQ information... " + "falling back to ELCR\n"); + + if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || + ELCR_trigger(13)) + printk(KERN_ERR "ELCR contains invalid data... " + "not using ELCR\n"); + else { + printk(KERN_INFO + "Using ELCR to identify PCI interrupts\n"); + ELCR_fallback = 1; + } + } + + for (i = 0; i < 16; i++) { + switch (mpc_default_type) { + case 2: + if (i == 0 || i == 13) + continue; /* IRQ0 & IRQ13 not connected */ + /* fall through */ + default: + if (i == 2) + continue; /* IRQ2 is never connected */ + } + + if (ELCR_fallback) { + /* + * If the ELCR indicates a level-sensitive interrupt, we + * copy that information over to the MP table in the + * irqflag field (level sensitive, active high polarity). + */ + if (ELCR_trigger(i)) + intsrc.irqflag = 13; + else + intsrc.irqflag = 0; + } + + intsrc.srcbusirq = i; + intsrc.dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ + mp_save_irq(&intsrc); + } + + intsrc.irqtype = mp_ExtINT; + intsrc.srcbusirq = 0; + intsrc.dstirq = 0; /* 8259A to INTIN0 */ + mp_save_irq(&intsrc); +} + + +static void __init construct_ioapic_table(int mpc_default_type) +{ + struct mpc_ioapic ioapic; + struct mpc_bus bus; + + bus.type = MP_BUS; + bus.busid = 0; + switch (mpc_default_type) { + default: + printk(KERN_ERR "???\nUnknown standard configuration %d\n", + mpc_default_type); + /* fall through */ + case 1: + case 5: + memcpy(bus.bustype, "ISA ", 6); + break; + case 2: + case 6: + case 3: + memcpy(bus.bustype, "EISA ", 6); + break; + case 4: + case 7: + memcpy(bus.bustype, "MCA ", 6); + } + MP_bus_info(&bus); + if (mpc_default_type > 4) { + bus.busid = 1; + memcpy(bus.bustype, "PCI ", 6); + MP_bus_info(&bus); + } + + ioapic.type = MP_IOAPIC; + ioapic.apicid = 2; + ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01; + ioapic.flags = MPC_APIC_USABLE; + ioapic.apicaddr = IO_APIC_DEFAULT_PHYS_BASE; + MP_ioapic_info(&ioapic); + + /* + * We set up most of the low 16 IO-APIC pins according to MPS rules. + */ + construct_default_ioirq_mptable(mpc_default_type); +} +#else +static inline void __init construct_ioapic_table(int mpc_default_type) { } +#endif + +static inline void __init construct_default_ISA_mptable(int mpc_default_type) +{ + struct mpc_cpu processor; + struct mpc_lintsrc lintsrc; + int linttypes[2] = { mp_ExtINT, mp_NMI }; + int i; + +#ifndef CONFIG_XEN + /* + * local APIC has default address + */ + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; +#endif + + /* + * 2 CPUs, numbered 0 & 1. + */ + processor.type = MP_PROCESSOR; + /* Either an integrated APIC or a discrete 82489DX. */ + processor.apicver = mpc_default_type > 4 ? 
0x10 : 0x01; + processor.cpuflag = CPU_ENABLED; + processor.cpufeature = (boot_cpu_data.x86 << 8) | + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; + processor.featureflag = boot_cpu_data.x86_capability[0]; + processor.reserved[0] = 0; + processor.reserved[1] = 0; + for (i = 0; i < 2; i++) { + processor.apicid = i; + MP_processor_info(&processor); + } + + construct_ioapic_table(mpc_default_type); + + lintsrc.type = MP_LINTSRC; + lintsrc.irqflag = 0; /* conforming */ + lintsrc.srcbusid = 0; + lintsrc.srcbusirq = 0; + lintsrc.destapic = MP_APIC_ALL; + for (i = 0; i < 2; i++) { + lintsrc.irqtype = linttypes[i]; + lintsrc.destapiclint = i; + MP_lintsrc_info(&lintsrc); + } +} + +static struct mpf_intel *mpf_found; + +static unsigned long __init get_mpc_size(unsigned long physptr) +{ + struct mpc_table *mpc; + unsigned long size; + + mpc = early_ioremap(physptr, PAGE_SIZE); + size = mpc->length; + early_iounmap(mpc, PAGE_SIZE); + apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size); + + return size; +} + +static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) +{ + struct mpc_table *mpc; + unsigned long size; + + size = get_mpc_size(mpf->physptr); + mpc = early_ioremap(mpf->physptr, size); + /* + * Read the physical hardware table. Anything here will + * override the defaults. + */ + if (!smp_read_mpc(mpc, early)) { +#ifdef CONFIG_X86_LOCAL_APIC + smp_found_config = 0; +#endif + printk(KERN_ERR "BIOS bug, MP table errors detected!...\n" + "... disabling SMP support. (tell your hw vendor)\n"); + early_iounmap(mpc, size); + return -1; + } + early_iounmap(mpc, size); + + if (early) + return -1; + +#ifdef CONFIG_X86_IO_APIC + /* + * If there are no explicit MP IRQ entries, then we are + * broken. We set up most of the low 16 IO-APIC pins to + * ISA defaults and hope it will work. + */ + if (!mp_irq_entries) { + struct mpc_bus bus; + + printk(KERN_ERR "BIOS bug, no explicit IRQ entries, " + "using default mptable. (tell your hw vendor)\n"); + + bus.type = MP_BUS; + bus.busid = 0; + memcpy(bus.bustype, "ISA ", 6); + MP_bus_info(&bus); + + construct_default_ioirq_mptable(0); + } +#endif + + return 0; +} + +/* + * Scan the memory blocks for an SMP configuration block. + */ +void __init default_get_smp_config(unsigned int early) +{ + struct mpf_intel *mpf = mpf_found; + + if (!mpf) + return; + +#ifdef CONFIG_XEN + BUG_ON(early); +#define early 0 +#endif + + if (acpi_lapic && early) + return; + + /* + * MPS doesn't support hyperthreading, aka only have + * thread 0 apic id in MPS table + */ + if (acpi_lapic && acpi_ioapic) + return; + + printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", + mpf->specification); +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) && !defined(CONFIG_XEN) + if (mpf->feature2 & (1 << 7)) { + printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); + pic_mode = 1; + } else { + printk(KERN_INFO " Virtual Wire compatibility mode.\n"); + pic_mode = 0; + } +#endif + /* + * Now see if we need to read further. + */ + if (mpf->feature1 != 0) { +#ifndef CONFIG_XEN + if (early) { + /* + * local APIC has default address + */ + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + return; + } +#endif + + printk(KERN_INFO "Default MP configuration #%d\n", + mpf->feature1); + construct_default_ISA_mptable(mpf->feature1); + + } else if (mpf->physptr) { + if (check_physptr(mpf, early)) + return; + } else + BUG(); + + if (!early) + printk(KERN_INFO "Processors: %d\n", num_processors); + /* + * Only use the first configuration found. 
+ */ +#undef early +} + +#ifndef CONFIG_XEN +static void __init smp_reserve_memory(struct mpf_intel *mpf) +{ + memblock_reserve(mpf->physptr, get_mpc_size(mpf->physptr)); +} +#endif + +static int __init smp_scan_config(unsigned long base, unsigned long length) +{ + unsigned int *bp = _bus_to_virt(base); + struct mpf_intel *mpf; +#ifndef CONFIG_XEN + unsigned long mem; +#endif + + apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n", + bp, length); + BUILD_BUG_ON(sizeof(*mpf) != 16); + + while (length > 0) { + mpf = (struct mpf_intel *)bp; + if ((*bp == SMP_MAGIC_IDENT) && + (mpf->length == 1) && + !mpf_checksum((unsigned char *)bp, 16) && + ((mpf->specification == 1) + || (mpf->specification == 4))) { +#ifdef CONFIG_X86_LOCAL_APIC + smp_found_config = 1; +#endif + mpf_found = mpf; + +#ifndef CONFIG_XEN + printk(KERN_INFO "found SMP MP-table at [%p] %llx\n", + mpf, (u64)virt_to_phys(mpf)); + + mem = virt_to_phys(mpf); + memblock_reserve(mem, sizeof(*mpf)); + if (mpf->physptr) + smp_reserve_memory(mpf); +#else + printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n", + mpf, ((void *)bp - _bus_to_virt(base)) + base); +#endif + return 1; + } + bp += 4; + length -= 16; + } + return 0; +} + +void __init default_find_smp_config(void) +{ +#ifndef CONFIG_XEN + unsigned int address; +#endif + + /* + * FIXME: Linux assumes you have 640K of base ram.. + * this continues the error... + * + * 1) Scan the bottom 1K for a signature + * 2) Scan the top 1K of base RAM + * 3) Scan the 64K of bios + */ + if (smp_scan_config(0x0, 0x400) || + smp_scan_config(639 * 0x400, 0x400) || + smp_scan_config(0xF0000, 0x10000)) + return; + /* + * If it is an SMP machine we should know now, unless the + * configuration is in an EISA/MCA bus machine with an + * extended bios data area. + * + * there is a real-mode segmented pointer pointing to the + * 4K EBDA area at 0x40E, calculate and scan it here. + * + * NOTE! There are Linux loaders that will corrupt the EBDA + * area, and as such this kind of SMP config may be less + * trustworthy, simply because the SMP table may have been + * stomped on during early boot. These loaders are buggy and + * should be fixed. + * + * MP1.4 SPEC states to only scan first 1K of 4K EBDA. 
+ */ + +#ifndef CONFIG_XEN + address = get_bios_ebda(); + if (address) + smp_scan_config(address, 0x400); +#endif +} + +#ifdef CONFIG_X86_IO_APIC +static u8 __initdata irq_used[MAX_IRQ_SOURCES]; + +static int __init get_MP_intsrc_index(struct mpc_intsrc *m) +{ + int i; + + if (m->irqtype != mp_INT) + return 0; + + if (m->irqflag != 0x0f) + return 0; + + /* not legacy */ + + for (i = 0; i < mp_irq_entries; i++) { + if (mp_irqs[i].irqtype != mp_INT) + continue; + + if (mp_irqs[i].irqflag != 0x0f) + continue; + + if (mp_irqs[i].srcbus != m->srcbus) + continue; + if (mp_irqs[i].srcbusirq != m->srcbusirq) + continue; + if (irq_used[i]) { + /* already claimed */ + return -2; + } + irq_used[i] = 1; + return i; + } + + /* not found */ + return -1; +} + +#define SPARE_SLOT_NUM 20 + +static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM]; + +static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) +{ + int i; + + apic_printk(APIC_VERBOSE, "OLD "); + print_mp_irq_info(m); + + i = get_MP_intsrc_index(m); + if (i > 0) { + memcpy(m, &mp_irqs[i], sizeof(*m)); + apic_printk(APIC_VERBOSE, "NEW "); + print_mp_irq_info(&mp_irqs[i]); + return; + } + if (!i) { + /* legacy, do nothing */ + return; + } + if (*nr_m_spare < SPARE_SLOT_NUM) { + /* + * not found (-1), or duplicated (-2) are invalid entries, + * we need to use the slot later + */ + m_spare[*nr_m_spare] = m; + *nr_m_spare += 1; + } +} + +static int __init +check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) +{ + if (!mpc_new_phys || count <= mpc_new_length) { + WARN(1, "update_mptable: No spare slots (length: %x)\n", count); + return -1; + } + + return 0; +} +#else /* CONFIG_X86_IO_APIC */ +static +inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {} +#endif /* CONFIG_X86_IO_APIC */ + +static int __init replace_intsrc_all(struct mpc_table *mpc, + unsigned long mpc_new_phys, + unsigned long mpc_new_length) +{ +#ifdef CONFIG_X86_IO_APIC + int i; +#endif + int count = sizeof(*mpc); + int nr_m_spare = 0; + unsigned char *mpt = ((unsigned char *)mpc) + count; + + printk(KERN_INFO "mpc_length %x\n", mpc->length); + while (count < mpc->length) { + switch (*mpt) { + case MP_PROCESSOR: + skip_entry(&mpt, &count, sizeof(struct mpc_cpu)); + break; + case MP_BUS: + skip_entry(&mpt, &count, sizeof(struct mpc_bus)); + break; + case MP_IOAPIC: + skip_entry(&mpt, &count, sizeof(struct mpc_ioapic)); + break; + case MP_INTSRC: + check_irq_src((struct mpc_intsrc *)mpt, &nr_m_spare); + skip_entry(&mpt, &count, sizeof(struct mpc_intsrc)); + break; + case MP_LINTSRC: + skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc)); + break; + default: + /* wrong mptable */ + smp_dump_mptable(mpc, mpt); + goto out; + } + } + +#ifdef CONFIG_X86_IO_APIC + for (i = 0; i < mp_irq_entries; i++) { + if (irq_used[i]) + continue; + + if (mp_irqs[i].irqtype != mp_INT) + continue; + + if (mp_irqs[i].irqflag != 0x0f) + continue; + + if (nr_m_spare > 0) { + apic_printk(APIC_VERBOSE, "*NEW* found\n"); + nr_m_spare--; + memcpy(m_spare[nr_m_spare], &mp_irqs[i], sizeof(mp_irqs[i])); + m_spare[nr_m_spare] = NULL; + } else { + struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; + count += sizeof(struct mpc_intsrc); + if (check_slot(mpc_new_phys, mpc_new_length, count) < 0) + goto out; + memcpy(m, &mp_irqs[i], sizeof(*m)); + mpc->length = count; + mpt += sizeof(struct mpc_intsrc); + } + print_mp_irq_info(&mp_irqs[i]); + } +#endif +out: + /* update checksum */ + mpc->checksum = 0; + mpc->checksum -= mpf_checksum((unsigned char *)mpc, 
mpc->length); + + return 0; +} + +int enable_update_mptable; + +static int __init update_mptable_setup(char *str) +{ + enable_update_mptable = 1; +#ifdef CONFIG_PCI + pci_routeirq = 1; +#endif + return 0; +} +early_param("update_mptable", update_mptable_setup); + +static unsigned long __initdata mpc_new_phys; +static unsigned long mpc_new_length __initdata = 4096; + +/* alloc_mptable or alloc_mptable=4k */ +static int __initdata alloc_mptable; +static int __init parse_alloc_mptable_opt(char *p) +{ + enable_update_mptable = 1; +#ifdef CONFIG_PCI + pci_routeirq = 1; +#endif + alloc_mptable = 1; + if (!p) + return 0; + mpc_new_length = memparse(p, &p); + return 0; +} +early_param("alloc_mptable", parse_alloc_mptable_opt); + +void __init early_reserve_e820_mpc_new(void) +{ + if (enable_update_mptable && alloc_mptable) + mpc_new_phys = early_reserve_e820(mpc_new_length, 4); +} + +static int __init update_mp_table(void) +{ + char str[16]; + char oem[10]; + struct mpf_intel *mpf; + struct mpc_table *mpc, *mpc_new; + + if (!enable_update_mptable) + return 0; + + mpf = mpf_found; + if (!mpf) + return 0; + + /* + * Now see if we need to go further. + */ + if (mpf->feature1 != 0) + return 0; + + if (!mpf->physptr) + return 0; + + mpc = _bus_to_virt(mpf->physptr); + + if (!smp_check_mpc(mpc, oem, str)) + return 0; + + printk(KERN_INFO "mpf: %llx\n", (u64)arbitrary_virt_to_machine(mpf)); + printk(KERN_INFO "physptr: %x\n", mpf->physptr); + + if (mpc_new_phys && mpc->length > mpc_new_length) { + mpc_new_phys = 0; + printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n", + mpc_new_length); + } + + if (!mpc_new_phys) { + unsigned char old, new; + /* check if we can change the position */ + mpc->checksum = 0; + old = mpf_checksum((unsigned char *)mpc, mpc->length); + mpc->checksum = 0xff; + new = mpf_checksum((unsigned char *)mpc, mpc->length); + if (old == new) { + printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n"); + return 0; + } + printk(KERN_INFO "use in-position replacing\n"); + } else { + maddr_t mpc_new_bus; + + mpc_new_bus = phys_to_machine(mpc_new_phys); + mpf->physptr = mpc_new_bus; + mpc_new = phys_to_virt(mpc_new_phys); + memcpy(mpc_new, mpc, mpc->length); + mpc = mpc_new; + /* check if we can modify that */ + if (mpc_new_bus - mpf->physptr) { + struct mpf_intel *mpf_new; + /* steal 16 bytes from [0, 1k) */ + printk(KERN_INFO "mpf new: %x\n", 0x400 - 16); + mpf_new = isa_bus_to_virt(0x400 - 16); + memcpy(mpf_new, mpf, 16); + mpf = mpf_new; + mpf->physptr = mpc_new_bus; + } + mpf->checksum = 0; + mpf->checksum -= mpf_checksum((unsigned char *)mpf, 16); + printk(KERN_INFO "physptr new: %x\n", mpf->physptr); + } + + /* + * only replace the one with mp_INT and + * MP_IRQ_TRIGGER_LEVEL|MP_IRQ_POLARITY_LOW, + * already in mp_irqs , stored by ... and mp_config_acpi_gsi, + * may need pci=routeirq for all coverage + */ + replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length); + + return 0; +} + +late_initcall(update_mp_table); diff --git a/arch/x86/kernel/msr-xen.c b/arch/x86/kernel/msr-xen.c new file mode 100644 index 0000000..900a894 --- /dev/null +++ b/arch/x86/kernel/msr-xen.c @@ -0,0 +1,337 @@ +#ifndef CONFIG_XEN_PRIVILEGED_GUEST +#include "msr.c" +#else +/* ----------------------------------------------------------------------- * + * + * Copyright 2010 Novell, Inc. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * x86 MSR access device + * + * This device is accessed by lseek() to the appropriate register number + * and then read/write in chunks of 8 bytes. A larger size means multiple + * reads or writes of the same register. + * + * This driver uses /dev/xen/cpu/%d/msr where %d correlates to the minor + * number, and on an SMP box will direct the access to pCPU %d. + */ + +static int msr_init(void); +static void msr_exit(void); + +#define msr_init(args...) _msr_init(args) +#define msr_exit(args...) _msr_exit(args) +#include "msr.c" +#undef msr_exit +#undef msr_init + +#include +#include + +static struct class *pmsr_class; +static unsigned int minor_bias = 10; +static unsigned int nr_xen_cpu_ids; +static unsigned long *xen_cpu_online_map; + +#define PMSR_DEV(cpu) MKDEV(MSR_MAJOR, (cpu) + minor_bias) + +static unsigned int pmsr_minor(struct inode *inode) +{ + return iminor(inode) - minor_bias; +} + +static ssize_t pmsr_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u32 __user *tmp = (u32 __user *) buf; + u32 data[2]; + u32 reg = *ppos; + unsigned int cpu = pmsr_minor(file->f_path.dentry->d_inode); + int err = 0; + ssize_t bytes = 0; + + if (count % 8) + return -EINVAL; /* Invalid chunk size */ + + for (; count; count -= 8) { + err = rdmsr_safe_on_pcpu(cpu, reg, &data[0], &data[1]); + if (err) + break; + if (copy_to_user(tmp, &data, 8)) { + err = -EFAULT; + break; + } + tmp += 2; + bytes += 8; + } + + return bytes ? bytes : err; +} + +static ssize_t pmsr_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + const u32 __user *tmp = (const u32 __user *)buf; + u32 data[2]; + u32 reg = *ppos; + unsigned int cpu = pmsr_minor(file->f_path.dentry->d_inode); + int err = 0; + ssize_t bytes = 0; + + if (count % 8) + return -EINVAL; /* Invalid chunk size */ + + for (; count; count -= 8) { + if (copy_from_user(&data, tmp, 8)) { + err = -EFAULT; + break; + } + err = wrmsr_safe_on_pcpu(cpu, reg, data[0], data[1]); + if (err) + break; + tmp += 2; + bytes += 8; + } + + return bytes ? 
bytes : err;
+}
+
+static long pmsr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
+{
+	u32 __user *uregs = (u32 __user *)arg;
+	u32 regs[8];
+	unsigned int cpu = pmsr_minor(file->f_path.dentry->d_inode);
+	int err;
+
+	switch (ioc) {
+	case X86_IOC_RDMSR_REGS:
+		if (!(file->f_mode & FMODE_READ)) {
+			err = -EBADF;
+			break;
+		}
+		if (copy_from_user(&regs, uregs, sizeof regs)) {
+			err = -EFAULT;
+			break;
+		}
+		err = rdmsr_safe_regs_on_pcpu(cpu, regs);
+		if (err)
+			break;
+		if (copy_to_user(uregs, &regs, sizeof regs))
+			err = -EFAULT;
+		break;
+
+	case X86_IOC_WRMSR_REGS:
+		if (!(file->f_mode & FMODE_WRITE)) {
+			err = -EBADF;
+			break;
+		}
+		if (copy_from_user(&regs, uregs, sizeof regs)) {
+			err = -EFAULT;
+			break;
+		}
+		err = wrmsr_safe_regs_on_pcpu(cpu, regs);
+		if (err)
+			break;
+		if (copy_to_user(uregs, &regs, sizeof regs))
+			err = -EFAULT;
+		break;
+
+	default:
+		err = -ENOTTY;
+		break;
+	}
+
+	return err;
+}
+
+static int pmsr_open(struct inode *inode, struct file *file)
+{
+	unsigned int cpu;
+
+	cpu = pmsr_minor(file->f_path.dentry->d_inode);
+	if (cpu >= nr_xen_cpu_ids || !test_bit(cpu, xen_cpu_online_map))
+		return -ENXIO; /* No such CPU */
+
+	return 0;
+}
+
+/*
+ * File operations we support
+ */
+static const struct file_operations pmsr_fops = {
+	.owner = THIS_MODULE,
+	.llseek = msr_seek,
+	.read = pmsr_read,
+	.write = pmsr_write,
+	.open = pmsr_open,
+	.unlocked_ioctl = pmsr_ioctl,
+	.compat_ioctl = pmsr_ioctl,
+};
+
+static int pmsr_device_create(unsigned int cpu)
+{
+	struct device *dev;
+
+	if (cpu >= nr_xen_cpu_ids) {
+		static bool warned;
+		unsigned long *map;
+
+		if ((minor_bias + cpu) >> MINORBITS) {
+			if (!warned) {
+				warned = true;
+				pr_warn("Physical MSRs of CPUs beyond %u"
+					" will not be accessible\n",
+					MINORMASK - minor_bias);
+			}
+			return -EDOM;
+		}
+
+		map = kcalloc(BITS_TO_LONGS(cpu + 1), sizeof(*map),
+			      GFP_KERNEL);
+		if (!map) {
+			if (!warned) {
+				warned = true;
+				pr_warn("Physical MSRs of CPUs beyond %u"
+					" may not be accessible\n",
+					nr_xen_cpu_ids - 1);
+			}
+			return -ENOMEM;
+		}
+
+		memcpy(map, xen_cpu_online_map,
+		       BITS_TO_LONGS(nr_xen_cpu_ids)
+		       * sizeof(*xen_cpu_online_map));
+		nr_xen_cpu_ids = min_t(unsigned int,
+				       BITS_TO_LONGS(cpu + 1) * BITS_PER_LONG,
+				       MINORMASK + 1 - minor_bias);
+		kfree(xchg(&xen_cpu_online_map, map));
+	}
+	set_bit(cpu, xen_cpu_online_map);
+	dev = device_create(pmsr_class, NULL, PMSR_DEV(cpu), NULL,
+			    "pmsr%d", cpu);
+	return IS_ERR(dev) ?
PTR_ERR(dev) : 0; +} + +static void pmsr_device_destroy(unsigned int cpu) +{ + clear_bit(cpu, xen_cpu_online_map); + device_destroy(pmsr_class, PMSR_DEV(cpu)); +} + +static int pmsr_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action) { + case CPU_ONLINE: + pmsr_device_create(cpu); + break; + case CPU_DEAD: + pmsr_device_destroy(cpu); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block pmsr_cpu_notifier = { + .notifier_call = pmsr_cpu_callback, +}; + +static char *pmsr_devnode(struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "xen/cpu/%u/msr", + MINOR(dev->devt) - minor_bias); +} + +static int __init msr_init(void) +{ + int err; + xen_platform_op_t op; + + err = _msr_init(); + if (err || !is_initial_xendomain()) + return err; + + op.cmd = XENPF_get_cpuinfo; + op.u.pcpu_info.xen_cpuid = 0; + do { + err = HYPERVISOR_platform_op(&op); + } while (err == -EBUSY); + if (err) + goto out; + nr_xen_cpu_ids = BITS_TO_LONGS(op.u.pcpu_info.max_present + 1) + * BITS_PER_LONG; + + while (minor_bias < NR_CPUS) + minor_bias *= 10; + if ((minor_bias + nr_xen_cpu_ids - 1) >> MINORBITS) + minor_bias = NR_CPUS; + if ((minor_bias + nr_xen_cpu_ids - 1) >> MINORBITS) + nr_xen_cpu_ids = MINORMASK + 1 - NR_CPUS; + + xen_cpu_online_map = kcalloc(BITS_TO_LONGS(nr_xen_cpu_ids), + sizeof(*xen_cpu_online_map), + GFP_KERNEL); + if (!xen_cpu_online_map) { + err = -ENOMEM; + goto out; + } + + if (__register_chrdev(MSR_MAJOR, minor_bias, + MINORMASK + 1 - minor_bias, + "pcpu/msr", &pmsr_fops)) { + pr_err("msr: unable to get minors for pmsr\n"); + goto out; + } + pmsr_class = class_create(THIS_MODULE, "pmsr"); + if (IS_ERR(pmsr_class)) { + err = PTR_ERR(pmsr_class); + goto out_chrdev; + } + pmsr_class->devnode = pmsr_devnode; + err = register_pcpu_notifier(&pmsr_cpu_notifier); + + if (!err && !nr_xen_cpu_ids) + err = -ENODEV; + if (!err) + return 0; + + class_destroy(pmsr_class); + +out_chrdev: + __unregister_chrdev(MSR_MAJOR, minor_bias, + MINORMASK + 1 - minor_bias, "pcpu/msr"); +out: + if (err) + pr_warn("msr: can't initialize physical MSR access (%d)\n", + err); + nr_xen_cpu_ids = 0; + kfree(xen_cpu_online_map); + return 0; +} + +static void __exit msr_exit(void) +{ + if (nr_xen_cpu_ids) { + unsigned int cpu = 0; + + unregister_pcpu_notifier(&pmsr_cpu_notifier); + for_each_set_bit(cpu, xen_cpu_online_map, nr_xen_cpu_ids) + pmsr_device_destroy(cpu); + class_destroy(pmsr_class); + __unregister_chrdev(MSR_MAJOR, minor_bias, + MINORMASK + 1 - minor_bias, "pcpu/msr"); + kfree(xen_cpu_online_map); + } + _msr_exit(); +} +#endif /* CONFIG_XEN_PRIVILEGED_GUEST */ diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 47acaf3..0509542 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -232,15 +232,12 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs) pr_emerg("Dazed and confused, but trying to continue\n"); /* Clear and disable the PCI SERR error line. */ - reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR; - outb(reason, NMI_REASON_PORT); + clear_serr_error(reason); } static notrace __kprobes void io_check_error(unsigned char reason, struct pt_regs *regs) { - unsigned long i; - pr_emerg( "NMI: IOCK error (debug interrupt?) 
for reason %02x on CPU %d.\n", reason, smp_processor_id()); @@ -250,17 +247,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs) panic("NMI IOCK error: Not continuing"); /* Re-enable the IOCK line, wait for a few seconds */ - reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK; - outb(reason, NMI_REASON_PORT); - - i = 20000; - while (--i) { - touch_nmi_watchdog(); - udelay(100); - } - - reason &= ~NMI_REASON_CLEAR_IOCHK; - outb(reason, NMI_REASON_PORT); + clear_io_check_error(reason); } static notrace __kprobes void diff --git a/arch/x86/kernel/pci-dma-xen.c b/arch/x86/kernel/pci-dma-xen.c new file mode 100644 index 0000000..14b72eb --- /dev/null +++ b/arch/x86/kernel/pci-dma-xen.c @@ -0,0 +1,366 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +static int forbid_dac __read_mostly; + +struct dma_map_ops *dma_ops = &nommu_dma_ops; +EXPORT_SYMBOL(dma_ops); + +static int iommu_sac_force __read_mostly; + +#ifdef CONFIG_IOMMU_DEBUG +int panic_on_overflow __read_mostly = 1; +int force_iommu __initdata = 1; +#else +int panic_on_overflow __read_mostly = 0; +int force_iommu __initdata = 0; +#endif + +int iommu_merge __initdata; + +int no_iommu __initdata; +#ifndef CONFIG_XEN +/* Set this to 1 if there is a HW IOMMU in the system */ +int iommu_detected __read_mostly = 0; + +/* + * This variable becomes 1 if iommu=pt is passed on the kernel command line. + * If this variable is 1, IOMMU implementations do no DMA translation for + * devices and allow every device to access to whole physical memory. This is + * useful if a user wants to use an IOMMU only for KVM device assignment to + * guests and not for driver dma translation. + */ +int iommu_pass_through __read_mostly; + +/* + * Group multi-function PCI devices into a single device-group for the + * iommu_device_group interface. This tells the iommu driver to pretend + * it cannot distinguish between functions of a device, exposing only one + * group for the device. Useful for disallowing use of individual PCI + * functions from userspace drivers. + */ +int iommu_group_mf __read_mostly; +#endif + +extern struct iommu_table_entry __iommu_table[], __iommu_table_end[]; + +/* Dummy device used for NULL arguments (normally ISA). 
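A minimal user-space sketch of how the pmsr character device added earlier in this patch could be exercised, assuming its read path mirrors the stock /dev/cpu/N/msr semantics (file offset selects the MSR index, each read returns an 8-byte value); the device path follows pmsr_devnode(), and the MSR number is only an example.

/* Sketch: read one MSR on physical CPU 0 through the pmsr node.
 * Assumes the read semantics mirror /dev/cpu/N/msr: the file offset
 * selects the MSR index and each read returns an 8-byte value.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const uint32_t msr = 0x10;	/* IA32 TSC, example index only */
	uint64_t val;
	int fd = open("/dev/xen/cpu/0/msr", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (pread(fd, &val, sizeof(val), msr) != sizeof(val)) {
		perror("pread");
		close(fd);
		return 1;
	}
	printf("MSR 0x%x on pCPU0 = 0x%llx\n", msr, (unsigned long long)val);
	close(fd);
	return 0;
}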
*/ +struct device x86_dma_fallback_dev = { + .init_name = "fallback device", + .coherent_dma_mask = ISA_DMA_BIT_MASK, + .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask, +}; +EXPORT_SYMBOL(x86_dma_fallback_dev); + +/* Number of entries preallocated for DMA-API debugging */ +#define PREALLOC_DMA_DEBUG_ENTRIES 32768 + +int dma_set_mask(struct device *dev, u64 mask) +{ + if (!dev->dma_mask || !dma_supported(dev, mask)) + return -EIO; + + *dev->dma_mask = mask; + + return 0; +} +EXPORT_SYMBOL(dma_set_mask); + +static struct dma_map_ops swiotlb_dma_ops = { + .alloc_coherent = dma_generic_alloc_coherent, + .free_coherent = dma_generic_free_coherent, + .mapping_error = swiotlb_dma_mapping_error, + .map_page = swiotlb_map_page, + .unmap_page = swiotlb_unmap_page, + .sync_single_for_cpu = swiotlb_sync_single_for_cpu, + .sync_single_for_device = swiotlb_sync_single_for_device, + .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, + .sync_sg_for_device = swiotlb_sync_sg_for_device, + .map_sg = swiotlb_map_sg_attrs, + .unmap_sg = swiotlb_unmap_sg_attrs, + .dma_supported = swiotlb_dma_supported +}; + +static int __init pci_xen_swiotlb_detect(void) +{ + return 1; +} + +static void __init pci_xen_swiotlb_init(void) +{ + swiotlb_init(1); + if (swiotlb) { + printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); + dma_ops = &swiotlb_dma_ops; + } +} + +IOMMU_INIT_FINISH(pci_xen_swiotlb_detect, NULL, pci_xen_swiotlb_init, NULL); + +void __init pci_iommu_alloc(void) +{ + struct iommu_table_entry *p; + + sort_iommu_table(__iommu_table, __iommu_table_end); + check_iommu_entries(__iommu_table, __iommu_table_end); + + for (p = __iommu_table; p < __iommu_table_end; p++) { + if (p && p->detect && p->detect() > 0) { + p->flags |= IOMMU_DETECTED; + if (p->early_init) + p->early_init(); + if (p->flags & IOMMU_FINISH_IF_DETECTED) + break; + } + } +} +void *dma_generic_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_addr, gfp_t flag) +{ + unsigned long dma_mask; + struct page *page; +#ifndef CONFIG_XEN + dma_addr_t addr; +#else + void *memory; +#endif + unsigned int order = get_order(size); + + dma_mask = dma_alloc_coherent_mask(dev, flag); + +#ifndef CONFIG_XEN + flag |= __GFP_ZERO; +again: +#else + flag &= ~(__GFP_DMA | __GFP_DMA32); +#endif + page = alloc_pages_node(dev_to_node(dev), flag, order); + if (!page) + return NULL; + +#ifndef CONFIG_XEN + addr = page_to_phys(page); + if (addr + size > dma_mask) { + __free_pages(page, order); + + if (dma_mask < DMA_BIT_MASK(32) && !(flag & GFP_DMA)) { + flag = (flag & ~GFP_DMA32) | GFP_DMA; + goto again; + } + + return NULL; + } + + *dma_addr = addr; + return page_address(page); +#else + memory = page_address(page); + if (xen_create_contiguous_region((unsigned long)memory, order, + fls64(dma_mask))) { + __free_pages(page, order); + return NULL; + } + + *dma_addr = virt_to_bus(memory); + return memset(memory, 0, size); +#endif +} + +#ifdef CONFIG_XEN +void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_addr) +{ + unsigned int order = get_order(size); + unsigned long va = (unsigned long)vaddr; + + xen_destroy_contiguous_region(va, order); + free_pages(va, order); +} +#endif + +/* + * See for the iommu kernel + * parameter documentation. 
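The Xen variant of dma_generic_alloc_coherent() above swaps machine frames underneath the allocation (xen_create_contiguous_region()) so the buffer becomes machine-contiguous. Drivers never see that detail; they go through the usual API, roughly as in the sketch below (my_dev_setup and BUF_SZ are illustrative names, not from this patch).

/* Driver-side view of the allocation path that ends in
 * dma_generic_alloc_coherent() on this tree. Illustrative only.
 */
#include <linux/device.h>
#include <linux/dma-mapping.h>

#define BUF_SZ	4096

static int my_dev_setup(struct device *dev)
{
	dma_addr_t bus_addr;
	void *buf;

	/* Declare what the device can address; dma_set_mask() is the
	 * helper defined in this file. */
	if (dma_set_mask(dev, DMA_BIT_MASK(32)))
		return -EIO;

	buf = dma_alloc_coherent(dev, BUF_SZ, &bus_addr, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	/* ... program bus_addr into the device, touch buf from the CPU ... */

	dma_free_coherent(dev, BUF_SZ, buf, bus_addr);
	return 0;
}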
+ */ +static __init int iommu_setup(char *p) +{ + iommu_merge = 1; + + if (!p) + return -EINVAL; + + while (*p) { + if (!strncmp(p, "off", 3)) + no_iommu = 1; + /* gart_parse_options has more force support */ + if (!strncmp(p, "force", 5)) + force_iommu = 1; + if (!strncmp(p, "noforce", 7)) { + iommu_merge = 0; + force_iommu = 0; + } + + if (!strncmp(p, "biomerge", 8)) { + iommu_merge = 1; + force_iommu = 1; + } + if (!strncmp(p, "panic", 5)) + panic_on_overflow = 1; + if (!strncmp(p, "nopanic", 7)) + panic_on_overflow = 0; + if (!strncmp(p, "merge", 5)) { + iommu_merge = 1; + force_iommu = 1; + } + if (!strncmp(p, "nomerge", 7)) + iommu_merge = 0; + if (!strncmp(p, "forcesac", 8)) + iommu_sac_force = 1; + if (!strncmp(p, "allowdac", 8)) + forbid_dac = 0; + if (!strncmp(p, "nodac", 5)) + forbid_dac = 1; + if (!strncmp(p, "usedac", 6)) { + forbid_dac = -1; + return 1; + } +#ifdef CONFIG_SWIOTLB + if (!strncmp(p, "soft", 4)) + swiotlb = 1; +#endif +#ifndef CONFIG_XEN + if (!strncmp(p, "pt", 2)) + iommu_pass_through = 1; + if (!strncmp(p, "group_mf", 8)) + iommu_group_mf = 1; +#endif + + gart_parse_options(p); + +#ifdef CONFIG_CALGARY_IOMMU + if (!strncmp(p, "calgary", 7)) + use_calgary = 1; +#endif /* CONFIG_CALGARY_IOMMU */ + + p += strcspn(p, ","); + if (*p == ',') + ++p; + } + return 0; +} +early_param("iommu", iommu_setup); + +static int check_pages_physically_contiguous(unsigned long pfn, + unsigned int offset, + size_t length) +{ + unsigned long next_mfn; + int i; + int nr_pages; + + next_mfn = pfn_to_mfn(pfn); + nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT; + + for (i = 1; i < nr_pages; i++) { + if (pfn_to_mfn(++pfn) != ++next_mfn) + return 0; + } + return 1; +} + +int range_straddles_page_boundary(paddr_t p, size_t size) +{ + unsigned long pfn = p >> PAGE_SHIFT; + unsigned int offset = p & ~PAGE_MASK; + + return ((offset + size > PAGE_SIZE) && + !check_pages_physically_contiguous(pfn, offset, size)); +} + +int dma_supported(struct device *dev, u64 mask) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + +#ifdef CONFIG_PCI + if (mask > 0xffffffff && forbid_dac > 0) { + dev_info(dev, "PCI: Disallowing DAC for device\n"); + return 0; + } +#endif + + if (ops->dma_supported) + return ops->dma_supported(dev, mask); + + /* Copied from i386. Doesn't make much sense, because it will + only work for pci_alloc_coherent. + The caller just has to use GFP_DMA in this case. */ + if (mask < DMA_BIT_MASK(24)) + return 0; + + /* Tell the device to use SAC when IOMMU force is on. This + allows the driver to use cheaper accesses in some cases. + + Problem with this is that if we overflow the IOMMU area and + return DAC as fallback address the device may not handle it + correctly. + + As a special case some controllers have a 39bit address + mode that is as efficient as 32bit (aic79xx). Don't force + SAC for these. Assume all masks <= 40 bits are of this + type. Normally this doesn't make any difference, but gives + more gentle handling of IOMMU overflow. 
*/ + if (iommu_sac_force && (mask >= DMA_BIT_MASK(40))) { + dev_info(dev, "Force SAC with mask %Lx\n", mask); + return 0; + } + + return 1; +} +EXPORT_SYMBOL(dma_supported); + +static int __init pci_iommu_init(void) +{ + struct iommu_table_entry *p; + dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES); + +#ifdef CONFIG_PCI + dma_debug_add_bus(&pci_bus_type); +#endif + x86_init.iommu.iommu_init(); + + for (p = __iommu_table; p < __iommu_table_end; p++) { + if (p && (p->flags & IOMMU_DETECTED) && p->late_init) + p->late_init(); + } + + return 0; +} +/* Must execute after PCI subsystem */ +rootfs_initcall(pci_iommu_init); + +#ifdef CONFIG_PCI +/* Many VIA bridges seem to corrupt data for DAC. Disable it here */ + +static __devinit void via_no_dac(struct pci_dev *dev) +{ + if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { + dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n"); + forbid_dac = 1; + } +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac); +#endif diff --git a/arch/x86/kernel/pci-nommu-xen.c b/arch/x86/kernel/pci-nommu-xen.c new file mode 100644 index 0000000..9dc9d8e --- /dev/null +++ b/arch/x86/kernel/pci-nommu-xen.c @@ -0,0 +1,114 @@ +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +#define IOMMU_BUG_ON(test) \ +do { \ + if (unlikely(test)) { \ + printk(KERN_ALERT "Fatal DMA error! " \ + "Please use 'swiotlb=force'\n"); \ + BUG(); \ + } \ +} while (0) + +static int +gnttab_map_sg(struct device *hwdev, struct scatterlist *sgl, int nents, + enum dma_data_direction dir, struct dma_attrs *attrs) +{ + unsigned int i; + struct scatterlist *sg; + + WARN_ON(nents == 0 || sgl->length == 0); + + for_each_sg(sgl, sg, nents, i) { + BUG_ON(!sg_page(sg)); + sg->dma_address = + gnttab_dma_map_page(sg_page(sg)) + sg->offset; + sg->dma_length = sg->length; + IOMMU_BUG_ON(!dma_capable( + hwdev, sg->dma_address, sg->length)); + IOMMU_BUG_ON(range_straddles_page_boundary( + page_to_pseudophys(sg_page(sg)) + sg->offset, + sg->length)); + } + + return nents; +} + +static void +gnttab_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nents, + enum dma_data_direction dir, struct dma_attrs *attrs) +{ + unsigned int i; + struct scatterlist *sg; + + for_each_sg(sgl, sg, nents, i) + gnttab_dma_unmap_page(sg->dma_address); +} + +static dma_addr_t +gnttab_map_page(struct device *dev, struct page *page, unsigned long offset, + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + dma_addr_t dma; + + WARN_ON(size == 0); + + dma = gnttab_dma_map_page(page) + offset; + IOMMU_BUG_ON(range_straddles_page_boundary(page_to_pseudophys(page) + + offset, size)); + IOMMU_BUG_ON(!dma_capable(dev, dma, size)); + + return dma; +} + +static void +gnttab_unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, + enum dma_data_direction dir, struct dma_attrs *attrs) +{ + gnttab_dma_unmap_page(dma_addr); +} + +static void nommu_sync_single_for_device(struct device *dev, + dma_addr_t addr, size_t size, + enum dma_data_direction dir) +{ + flush_write_buffers(); +} + + +static void nommu_sync_sg_for_device(struct device *dev, + struct scatterlist *sg, int nelems, + enum dma_data_direction dir) +{ + flush_write_buffers(); +} + +static int nommu_dma_supported(struct device *hwdev, u64 mask) +{ + return 1; +} + +struct dma_map_ops nommu_dma_ops = { + .alloc_coherent = dma_generic_alloc_coherent, + .free_coherent = dma_generic_free_coherent, + .map_page = gnttab_map_page, + .unmap_page = 
gnttab_unmap_page, + .map_sg = gnttab_map_sg, + .unmap_sg = gnttab_unmap_sg, + .sync_single_for_device = nommu_sync_single_for_device, + .sync_sg_for_device = nommu_sync_sg_for_device, + .dma_supported = nommu_dma_supported, +}; diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c index a311ffc..965c549 100644 --- a/arch/x86/kernel/pcspeaker.c +++ b/arch/x86/kernel/pcspeaker.c @@ -6,6 +6,11 @@ static __init int add_pcspkr(void) { struct platform_device *pd; +#ifdef CONFIG_XEN + if (!is_initial_xendomain()) + return 0; +#endif + pd = platform_device_register_simple("pcspkr", -1, NULL, 0); return IS_ERR(pd) ? PTR_ERR(pd) : 0; diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c index 34e06e8..5fb1971 100644 --- a/arch/x86/kernel/probe_roms.c +++ b/arch/x86/kernel/probe_roms.c @@ -114,6 +114,11 @@ static struct resource *find_oprom(struct pci_dev *pdev) struct resource *oprom = NULL; int i; +#ifdef CONFIG_XEN + if (!is_initial_xendomain()) + return NULL; +#endif + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources); i++) { struct resource *res = &adapter_rom_resources[i]; unsigned short offset, vendor, device, list, rev; @@ -232,7 +237,7 @@ void __init probe_roms(void) upper = system_rom_resource.start; /* check for extension rom (ignore length byte!) */ - rom = isa_bus_to_virt(extension_rom_resource.start); + rom = isa_bus_to_virt((unsigned long)extension_rom_resource.start); if (romsignature(rom)) { length = resource_size(&extension_rom_resource); if (romchecksum(rom, length)) { diff --git a/arch/x86/kernel/process-xen.c b/arch/x86/kernel/process-xen.c new file mode 100644 index 0000000..521d602 --- /dev/null +++ b/arch/x86/kernel/process-xen.c @@ -0,0 +1,630 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct kmem_cache *task_xstate_cachep; +EXPORT_SYMBOL_GPL(task_xstate_cachep); + +int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) +{ + int ret; + + *dst = *src; + if (fpu_allocated(&src->thread.fpu)) { + memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu)); + ret = fpu_alloc(&dst->thread.fpu); + if (ret) + return ret; + fpu_copy(&dst->thread.fpu, &src->thread.fpu); + } + return 0; +} + +void free_thread_xstate(struct task_struct *tsk) +{ + fpu_free(&tsk->thread.fpu); +} + +void free_thread_info(struct thread_info *ti) +{ + free_thread_xstate(ti->task); + free_pages((unsigned long)ti, THREAD_ORDER); +} + +void arch_task_cache_init(void) +{ + task_xstate_cachep = + kmem_cache_create("task_xstate", xstate_size, + __alignof__(union thread_xstate), + SLAB_PANIC | SLAB_NOTRACK, NULL); +} + +/* + * Free current thread data structures etc.. 
+ */ +void exit_thread(void) +{ + struct task_struct *me = current; + struct thread_struct *t = &me->thread; + unsigned long *bp = t->io_bitmap_ptr; + + if (bp) { + struct physdev_set_iobitmap set_iobitmap; + + t->io_bitmap_ptr = NULL; + clear_thread_flag(TIF_IO_BITMAP); + /* + * Careful, clear this in the TSS too: + */ + memset(&set_iobitmap, 0, sizeof(set_iobitmap)); + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, + &set_iobitmap)); + t->io_bitmap_max = 0; + kfree(bp); + } +} + +void show_regs(struct pt_regs *regs) +{ + show_registers(regs); + show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), 0); +} + +void show_regs_common(void) +{ + const char *vendor, *product, *board; + + vendor = dmi_get_system_info(DMI_SYS_VENDOR); + if (!vendor) + vendor = ""; + product = dmi_get_system_info(DMI_PRODUCT_NAME); + if (!product) + product = ""; + + /* Board Name is optional */ + board = dmi_get_system_info(DMI_BOARD_NAME); + + printk(KERN_CONT "\n"); + printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s", + current->pid, current->comm, print_tainted(), + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); + printk(KERN_CONT " %s %s", vendor, product); + if (board) + printk(KERN_CONT "/%s", board); + printk(KERN_CONT "\n"); +} + +void flush_thread(void) +{ + struct task_struct *tsk = current; + + flush_ptrace_hw_breakpoint(tsk); + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); + /* + * Forget coprocessor state.. + */ + tsk->fpu_counter = 0; + clear_fpu(tsk); + clear_used_math(); +} + +static void hard_disable_TSC(void) +{ + write_cr4(read_cr4() | X86_CR4_TSD); +} + +void disable_TSC(void) +{ + preempt_disable(); + if (!test_and_set_thread_flag(TIF_NOTSC)) + /* + * Must flip the CPU state synchronously with + * TIF_NOTSC in the current running context. + */ + hard_disable_TSC(); + preempt_enable(); +} + +static void hard_enable_TSC(void) +{ + write_cr4(read_cr4() & ~X86_CR4_TSD); +} + +static void enable_TSC(void) +{ + preempt_disable(); + if (test_and_clear_thread_flag(TIF_NOTSC)) + /* + * Must flip the CPU state synchronously with + * TIF_NOTSC in the current running context. 
+ */ + hard_enable_TSC(); + preempt_enable(); +} + +int get_tsc_mode(unsigned long adr) +{ + unsigned int val; + + if (test_thread_flag(TIF_NOTSC)) + val = PR_TSC_SIGSEGV; + else + val = PR_TSC_ENABLE; + + return put_user(val, (unsigned int __user *)adr); +} + +int set_tsc_mode(unsigned int val) +{ + if (val == PR_TSC_SIGSEGV) + disable_TSC(); + else if (val == PR_TSC_ENABLE) + enable_TSC(); + else + return -EINVAL; + + return 0; +} + +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) +{ + struct thread_struct *prev, *next; + + prev = &prev_p->thread; + next = &next_p->thread; + + if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^ + test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) { + unsigned long debugctl = get_debugctlmsr(); + + debugctl &= ~DEBUGCTLMSR_BTF; + if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) + debugctl |= DEBUGCTLMSR_BTF; + + update_debugctlmsr(debugctl); + } + + if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ + test_tsk_thread_flag(next_p, TIF_NOTSC)) { + /* prev and next are different */ + if (test_tsk_thread_flag(next_p, TIF_NOTSC)) + hard_disable_TSC(); + else + hard_enable_TSC(); + } + propagate_user_return_notify(prev_p, next_p); +} + +int sys_fork(struct pt_regs *regs) +{ + return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL); +} + +/* + * This is trivial, and on the face of it looks like it + * could equally well be done in user mode. + * + * Not so, for quite unobvious reasons - register pressure. + * In user mode vfork() cannot have a stack frame, and if + * done by calling the "clone()" system call directly, you + * do not have enough call-clobbered registers to hold all + * the information you need. + */ +int sys_vfork(struct pt_regs *regs) +{ + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0, + NULL, NULL); +} + +long +sys_clone(unsigned long clone_flags, unsigned long newsp, + void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) +{ + if (!newsp) + newsp = regs->sp; + return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); +} + +/* + * This gets run with %si containing the + * function to call, and %di containing + * the "args". + */ +extern void kernel_thread_helper(void); + +/* + * Create a kernel thread + */ +int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) +{ + struct pt_regs regs; + + memset(®s, 0, sizeof(regs)); + + regs.si = (unsigned long) fn; + regs.di = (unsigned long) arg; + +#ifdef CONFIG_X86_32 + regs.ds = __USER_DS; + regs.es = __USER_DS; + regs.fs = __KERNEL_PERCPU; + regs.gs = __KERNEL_STACK_CANARY; +#else + regs.ss = __KERNEL_DS; +#endif + + regs.orig_ax = -1; + regs.ip = (unsigned long) kernel_thread_helper; + regs.cs = __KERNEL_CS | get_kernel_rpl(); + regs.flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1; + + /* Ok, create the new process.. */ + return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); +} +EXPORT_SYMBOL(kernel_thread); + +/* + * sys_execve() executes a new program. + */ +long sys_execve(const char __user *name, + const char __user *const __user *argv, + const char __user *const __user *envp, struct pt_regs *regs) +{ + long error; + char *filename; + + filename = getname(name); + error = PTR_ERR(filename); + if (IS_ERR(filename)) + return error; + error = do_execve(filename, argv, envp, regs); + +#ifdef CONFIG_X86_32 + if (error == 0) { + /* Make sure we don't return using sysenter.. 
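get_tsc_mode()/set_tsc_mode() earlier in this file back the PR_GET_TSC/PR_SET_TSC prctl()s, with hard_disable_TSC() setting CR4.TSD so a later RDTSC from user mode faults. A minimal user-space sketch of that round trip:

/* Sketch: toggle RDTSC availability for the current task via prctl().
 * With PR_TSC_SIGSEGV in effect, a subsequent rdtsc raises SIGSEGV;
 * the handlers above flip CR4.TSD for the calling task.
 */
#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
	int mode = 0;

	if (prctl(PR_GET_TSC, &mode, 0, 0, 0) == 0)
		printf("TSC mode: %s\n",
		       mode == PR_TSC_ENABLE ? "enabled" : "trapping");

	/* Make RDTSC fault for this task... */
	prctl(PR_SET_TSC, PR_TSC_SIGSEGV, 0, 0, 0);
	/* ...and restore it. */
	prctl(PR_SET_TSC, PR_TSC_ENABLE, 0, 0, 0);
	return 0;
}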
*/ + set_thread_flag(TIF_IRET); + } +#endif + + putname(filename); + return error; +} + +/* + * Idle related variables and functions + */ +unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE; +EXPORT_SYMBOL(boot_option_idle_override); + +/* + * Powermanagement idle function, if any.. + */ +void (*pm_idle)(void); +#ifdef CONFIG_APM_MODULE +EXPORT_SYMBOL(pm_idle); +#endif + +/* + * We use this if we don't have any better + * idle routine.. + */ +void xen_idle(void) +{ + trace_power_start(POWER_CSTATE, 1, smp_processor_id()); + trace_cpu_idle(1, smp_processor_id()); + current_thread_info()->status &= ~TS_POLLING; + /* + * TS_POLLING-cleared state must be visible before we + * test NEED_RESCHED: + */ + smp_mb(); + + if (!need_resched()) + safe_halt(); /* enables interrupts racelessly */ + else + local_irq_enable(); + current_thread_info()->status |= TS_POLLING; + trace_power_end(smp_processor_id()); + trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); +} +#ifdef CONFIG_APM_MODULE +EXPORT_SYMBOL(default_idle); +#endif + +bool __init set_pm_idle_to_default(void) +{ + bool ret = !!pm_idle; + + pm_idle = xen_idle; + + return ret; +} +void stop_this_cpu(void *dummy) +{ + local_irq_disable(); + /* + * Remove this CPU: + */ + set_cpu_online(smp_processor_id(), false); + disable_all_local_evtchn(); + + for (;;) { + if (hlt_works(smp_processor_id())) + halt(); + } +} + +static void do_nothing(void *unused) +{ +} + +/* + * cpu_idle_wait - Used to ensure that all the CPUs discard old value of + * pm_idle and update to new pm_idle value. Required while changing pm_idle + * handler on SMP systems. + * + * Caller must have changed pm_idle to the new value before the call. Old + * pm_idle value will not be used by any CPU after the return of this function. + */ +void cpu_idle_wait(void) +{ + smp_mb(); + /* kick all the CPUs so that they exit out of pm_idle */ + smp_call_function(do_nothing, NULL, 1); +} +EXPORT_SYMBOL_GPL(cpu_idle_wait); + +#ifndef CONFIG_XEN +/* Default MONITOR/MWAIT with no hints, used for default C1 state */ +static void mwait_idle(void) +{ + if (!need_resched()) { + trace_power_start(POWER_CSTATE, 1, smp_processor_id()); + trace_cpu_idle(1, smp_processor_id()); + if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) + clflush((void *)¤t_thread_info()->flags); + + __monitor((void *)¤t_thread_info()->flags, 0, 0); + smp_mb(); + if (!need_resched()) + __sti_mwait(0, 0); + else + local_irq_enable(); + trace_power_end(smp_processor_id()); + trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); + } else + local_irq_enable(); +} +#endif + +/* + * On SMP it's slightly faster (but much more power-consuming!) + * to poll the ->work.need_resched flag instead of waiting for the + * cross-CPU IPI to arrive. Use this option with caution. + */ +static void poll_idle(void) +{ + trace_power_start(POWER_CSTATE, 0, smp_processor_id()); + trace_cpu_idle(0, smp_processor_id()); + local_irq_enable(); + while (!need_resched()) + cpu_relax(); + trace_power_end(smp_processor_id()); + trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); +} + +#ifndef CONFIG_XEN +/* + * mwait selection logic: + * + * It depends on the CPU. For AMD CPUs that support MWAIT this is + * wrong. Family 0x10 and 0x11 CPUs will enter C1 on HLT. Powersavings + * then depend on a clock divisor and current Pstate of the core. If + * all cores of a processor are in halt state (C1) the processor can + * enter the C1E (C1 enhanced) state. If mwait is used this will never + * happen. 
+ * + * idle=mwait overrides this decision and forces the usage of mwait. + */ + +#define MWAIT_INFO 0x05 +#define MWAIT_ECX_EXTENDED_INFO 0x01 +#define MWAIT_EDX_C1 0xf0 + +int mwait_usable(const struct cpuinfo_x86 *c) +{ + u32 eax, ebx, ecx, edx; + + if (boot_option_idle_override == IDLE_FORCE_MWAIT) + return 1; + + if (c->cpuid_level < MWAIT_INFO) + return 0; + + cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx); + /* Check, whether EDX has extended info about MWAIT */ + if (!(ecx & MWAIT_ECX_EXTENDED_INFO)) + return 1; + + /* + * edx enumeratios MONITOR/MWAIT extensions. Check, whether + * C1 supports MWAIT + */ + return (edx & MWAIT_EDX_C1); +} + +bool amd_e400_c1e_detected; +EXPORT_SYMBOL(amd_e400_c1e_detected); + +static cpumask_var_t amd_e400_c1e_mask; + +void amd_e400_remove_cpu(int cpu) +{ + if (amd_e400_c1e_mask != NULL) + cpumask_clear_cpu(cpu, amd_e400_c1e_mask); +} + +/* + * AMD Erratum 400 aware idle routine. We check for C1E active in the interrupt + * pending message MSR. If we detect C1E, then we handle it the same + * way as C3 power states (local apic timer and TSC stop) + */ +static void amd_e400_idle(void) +{ + if (need_resched()) + return; + + if (!amd_e400_c1e_detected) { + u32 lo, hi; + + rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); + + if (lo & K8_INTP_C1E_ACTIVE_MASK) { + amd_e400_c1e_detected = true; + if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) + mark_tsc_unstable("TSC halt in AMD C1E"); + printk(KERN_INFO "System has AMD C1E enabled\n"); + } + } + + if (amd_e400_c1e_detected) { + int cpu = smp_processor_id(); + + if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) { + cpumask_set_cpu(cpu, amd_e400_c1e_mask); + /* + * Force broadcast so ACPI can not interfere. + */ + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, + &cpu); + printk(KERN_INFO "Switch to broadcast mode on CPU%d\n", + cpu); + } + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); + + default_idle(); + + /* + * The switch back from broadcast mode needs to be + * called with interrupts disabled. + */ + local_irq_disable(); + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); + local_irq_enable(); + } else + default_idle(); +} +#endif + +void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) +{ +#ifndef CONFIG_XEN +#ifdef CONFIG_SMP + if (pm_idle == poll_idle && smp_num_siblings > 1) { + printk_once(KERN_WARNING "WARNING: polling idle and HT enabled," + " performance may degrade.\n"); + } +#endif + if (pm_idle) + return; + + if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) { + /* + * One CPU supports mwait => All CPUs supports mwait + */ + printk(KERN_INFO "using mwait in idle threads.\n"); + pm_idle = mwait_idle; + } else if (cpu_has_amd_erratum(amd_erratum_400)) { + /* E400: APIC timer interrupt does not wake up CPU from C1e */ + printk(KERN_INFO "using AMD E400 aware idle routine\n"); + pm_idle = amd_e400_idle; + } else + pm_idle = default_idle; +#endif +} + +void __init init_amd_e400_c1e_mask(void) +{ +#ifndef CONFIG_XEN + /* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. 
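mwait_usable() above gates the mwait idle selection on CPUID leaf 5: ECX bit 0 advertises extended MWAIT info, and bits 7:4 of EDX (mask 0xf0) count the C1 sub-states. The same check can be reproduced from user space, for example:

/* Sketch of the CPUID.5 check that mwait_usable() performs in the kernel. */
#include <cpuid.h>
#include <stdio.h>

#define MWAIT_INFO		0x05
#define MWAIT_ECX_EXTENDED_INFO	0x01
#define MWAIT_EDX_C1		0xf0

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx)) {
		printf("CPUID leaf 5 not supported\n");
		return 0;
	}
	if (!(ecx & MWAIT_ECX_EXTENDED_INFO)) {
		printf("no extended MWAIT info; assume usable\n");
		return 0;
	}
	printf("MWAIT C1 sub-states: %s\n",
	       (edx & MWAIT_EDX_C1) ? "present" : "absent");
	return 0;
}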
*/ + if (pm_idle == amd_e400_idle) + zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL); +#endif +} + +static int __init idle_setup(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "poll")) { + printk("using polling idle threads.\n"); + pm_idle = poll_idle; + boot_option_idle_override = IDLE_POLL; +#ifndef CONFIG_XEN + } else if (!strcmp(str, "mwait")) { + boot_option_idle_override = IDLE_FORCE_MWAIT; + WARN_ONCE(1, "\"idle=mwait\" will be removed in 2012\n"); + } else if (!strcmp(str, "halt")) { + /* + * When the boot option of idle=halt is added, halt is + * forced to be used for CPU idle. In such case CPU C2/C3 + * won't be used again. + * To continue to load the CPU idle driver, don't touch + * the boot_option_idle_override. + */ + pm_idle = default_idle; + boot_option_idle_override = IDLE_HALT; + } else if (!strcmp(str, "nomwait")) { + /* + * If the boot option of "idle=nomwait" is added, + * it means that mwait will be disabled for CPU C2/C3 + * states. In such case it won't touch the variable + * of boot_option_idle_override. + */ + boot_option_idle_override = IDLE_NOMWAIT; +#endif + } else + return -1; + + return 0; +} +early_param("idle", idle_setup); + +unsigned long arch_align_stack(unsigned long sp) +{ + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) + sp -= get_random_int() % 8192; + return sp & ~0xf; +} + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + unsigned long range_end = mm->brk + 0x02000000; + return randomize_range(mm->brk, range_end, 0) ? : mm->brk; +} + diff --git a/arch/x86/kernel/process_32-xen.c b/arch/x86/kernel/process_32-xen.c new file mode 100644 index 0000000..08e8000 --- /dev/null +++ b/arch/x86/kernel/process_32-xen.c @@ -0,0 +1,446 @@ +/* + * Copyright (C) 1995 Linus Torvalds + * + * Pentium III FXSR, SSE support + * Gareth Hughes , May 2000 + */ + +/* + * This file handles the architecture-dependent parts of process handling.. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_MATH_EMULATION +#include +#endif + +#include + +#include + +#include +#include +#include +#include +#include +#include + +asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); +asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork"); + +/* + * Return saved PC of a blocked thread. + */ +unsigned long thread_saved_pc(struct task_struct *tsk) +{ + return ((unsigned long *)tsk->thread.sp)[3]; +} + +#ifndef CONFIG_SMP +static inline void play_dead(void) +{ + BUG(); +} +#endif + +/* + * The idle thread. There's no useful work to be + * done, so just try to conserve power and have a + * low exit latency (ie sit in a loop waiting for + * somebody to say that they'd like to reschedule) + */ +void cpu_idle(void) +{ + int cpu = smp_processor_id(); + + /* + * If we're the non-boot CPU, nothing set the stack canary up + * for us. CPU0 already has it initialized but no harm in + * doing it again. This is a good place for updating it, as + * we wont ever return from this function (so the invalid + * canaries already on the stack wont ever trigger). 
+ */ + boot_init_stack_canary(); + + current_thread_info()->status |= TS_POLLING; + + /* endless idle loop with no priority at all */ + while (1) { + tick_nohz_idle_enter(); + rcu_idle_enter(); + while (!need_resched()) { + + check_pgt_cache(); + rmb(); + + if (cpu_is_offline(cpu)) + play_dead(); + + local_touch_nmi(); + local_irq_disable(); + /* Don't trace irqs off for idle */ + stop_critical_timings(); + if (cpuidle_idle_call()) + xen_idle(); + start_critical_timings(); + } + rcu_idle_exit(); + tick_nohz_idle_exit(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); + } +} + +void __show_regs(struct pt_regs *regs, int all) +{ + unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; + unsigned long d0, d1, d2, d3, d6, d7; + unsigned long sp; + unsigned short ss, gs; + + if (user_mode_vm(regs)) { + sp = regs->sp; + ss = regs->ss & 0xffff; + gs = get_user_gs(regs); + } else { + sp = kernel_stack_pointer(regs); + savesegment(ss, ss); + savesegment(gs, gs); + } + + show_regs_common(); + + printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", + (u16)regs->cs, regs->ip, regs->flags, + smp_processor_id()); + print_symbol("EIP is at %s\n", regs->ip); + + printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", + regs->ax, regs->bx, regs->cx, regs->dx); + printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", + regs->si, regs->di, regs->bp, sp); + printk(KERN_DEFAULT " DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", + (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss); + + if (!all) + return; + + cr0 = read_cr0(); + cr2 = read_cr2(); + cr3 = read_cr3(); + cr4 = read_cr4_safe(); + printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", + cr0, cr2, cr3, cr4); + + get_debugreg(d0, 0); + get_debugreg(d1, 1); + get_debugreg(d2, 2); + get_debugreg(d3, 3); + printk(KERN_DEFAULT "DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", + d0, d1, d2, d3); + + get_debugreg(d6, 6); + get_debugreg(d7, 7); + printk(KERN_DEFAULT "DR6: %08lx DR7: %08lx\n", + d6, d7); +} + +void release_thread(struct task_struct *dead_task) +{ + BUG_ON(dead_task->mm); + release_vm86_irqs(dead_task); +} + +/* + * This gets called before we allocate a new thread and copy + * the current task into it. + */ +void prepare_to_copy(struct task_struct *tsk) +{ + unlazy_fpu(tsk); +} + +int copy_thread(unsigned long clone_flags, unsigned long sp, + unsigned long unused, + struct task_struct *p, struct pt_regs *regs) +{ + struct pt_regs *childregs; + struct task_struct *tsk; + int err; + + childregs = task_pt_regs(p); + *childregs = *regs; + childregs->ax = 0; + childregs->sp = sp; + + p->thread.sp = (unsigned long) childregs; + p->thread.sp0 = (unsigned long) (childregs+1); + + p->thread.ip = (unsigned long) ret_from_fork; + + task_user_gs(p) = get_user_gs(regs); + + p->fpu_counter = 0; + p->thread.io_bitmap_ptr = NULL; + tsk = current; + err = -ENOMEM; + + memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); + +#ifdef TIF_CSTAR + if (test_tsk_thread_flag(tsk, TIF_CSTAR)) + p->thread.ip = (unsigned long) cstar_ret_from_fork; +#endif + if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { + p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, + IO_BITMAP_BYTES, GFP_KERNEL); + if (!p->thread.io_bitmap_ptr) { + p->thread.io_bitmap_max = 0; + return -ENOMEM; + } + set_tsk_thread_flag(p, TIF_IO_BITMAP); + } + + err = 0; + + /* + * Set a new TLS for the child thread? 
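The CLONE_SETTLS branch just below hands childregs->si to do_set_thread_area(), the same path user space reaches with the set_thread_area syscall on 32-bit. A hedged sketch of installing such a TLS descriptor directly (the descriptor fields mirror this patch's set_32bit_tls() in the 64-bit file):

/* Sketch (32-bit user space): install a TLS descriptor through the same
 * do_set_thread_area() path used for CLONE_SETTLS above. entry_number -1
 * asks the kernel to pick a free GDT TLS slot.
 */
#include <asm/ldt.h>		/* struct user_desc */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static char tls_block[256];	/* illustrative TLS backing store */

int main(void)
{
	struct user_desc ud = {
		.entry_number	= -1,		/* allocate a slot */
		.base_addr	= (unsigned long)tls_block,
		.limit		= 0xfffff,
		.seg_32bit	= 1,
		.limit_in_pages	= 1,
		.useable	= 1,
	};

	if (syscall(SYS_set_thread_area, &ud)) {
		perror("set_thread_area");
		return 1;
	}
	printf("TLS installed in GDT entry %u\n", ud.entry_number);
	return 0;
}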
+ */ + if (clone_flags & CLONE_SETTLS) + err = do_set_thread_area(p, -1, + (struct user_desc __user *)childregs->si, 0); + + p->thread.iopl = current->thread.iopl; + + if (err && p->thread.io_bitmap_ptr) { + kfree(p->thread.io_bitmap_ptr); + p->thread.io_bitmap_max = 0; + } + return err; +} + +void +start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) +{ + set_user_gs(regs, 0); + regs->fs = 0; + regs->ds = __USER_DS; + regs->es = __USER_DS; + regs->ss = __USER_DS; + regs->cs = __USER_CS; + regs->ip = new_ip; + regs->sp = new_sp; + /* + * Free the old FP and other extended state + */ + free_thread_xstate(current); +} +EXPORT_SYMBOL_GPL(start_thread); + +/* + * switch_to(x,y) should switch tasks from x to y. + * + * We fsave/fwait so that an exception goes off at the right time + * (as a call from the fsave or fwait in effect) rather than to + * the wrong process. Lazy FP saving no longer makes any sense + * with modern CPU's, and this simplifies a lot of things (SMP + * and UP become the same). + * + * NOTE! We used to use the x86 hardware context switching. The + * reason for not using it any more becomes apparent when you + * try to recover gracefully from saved state that is no longer + * valid (stale segment register values in particular). With the + * hardware task-switch, there is no way to fix up bad state in + * a reasonable manner. + * + * The fact that Intel documents the hardware task-switching to + * be slow is a fairly red herring - this code is not noticeably + * faster. However, there _is_ some room for improvement here, + * so the performance issues may eventually be a valid point. + * More important, however, is the fact that this allows us much + * more flexibility. + * + * The return value (in %ax) will be the "prev" task after + * the task-switch, and shows up in ret_from_fork in entry.S, + * for example. + */ +__notrace_funcgraph struct task_struct * +__switch_to(struct task_struct *prev_p, struct task_struct *next_p) +{ + struct thread_struct *prev = &prev_p->thread, + *next = &next_p->thread; + int cpu = smp_processor_id(); +#ifndef CONFIG_X86_NO_TSS + struct tss_struct *tss = &per_cpu(init_tss, cpu); +#endif + fpu_switch_t fpu; +#if CONFIG_XEN_COMPAT > 0x030002 + struct physdev_set_iopl iopl_op; + struct physdev_set_iobitmap iobmp_op; +#else + struct physdev_op _pdo[2], *pdo = _pdo; +#define iopl_op pdo->u.set_iopl +#define iobmp_op pdo->u.set_iobitmap +#endif + multicall_entry_t _mcl[8], *mcl = _mcl; + + /* XEN NOTE: FS/GS saved in switch_mm(), not here. */ + + fpu = xen_switch_fpu_prepare(prev_p, next_p, cpu, &mcl); + + /* + * Reload sp0. + * This is load_sp0(tss, next) with a multicall. + */ + mcl->op = __HYPERVISOR_stack_switch; + mcl->args[0] = __KERNEL_DS; + mcl->args[1] = next->sp0; + mcl++; + + /* + * Load the per-thread Thread-Local Storage descriptor. + * This is load_TLS(next, cpu) with multicalls. + */ +#define C(i) do { \ + if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \ + next->tls_array[i].b != prev->tls_array[i].b)) { \ + mcl->op = __HYPERVISOR_update_descriptor; \ + *(u64 *)&mcl->args[0] = arbitrary_virt_to_machine( \ + &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\ + *(u64 *)&mcl->args[2] = *(u64 *)&next->tls_array[i]; \ + mcl++; \ + } \ +} while (0) + C(0); C(1); C(2); +#undef C + + if (unlikely(prev->iopl != next->iopl)) { + iopl_op.iopl = (next->iopl == 0) ? 
1 : (next->iopl >> 12) & 3; +#if CONFIG_XEN_COMPAT > 0x030002 + mcl->op = __HYPERVISOR_physdev_op; + mcl->args[0] = PHYSDEVOP_set_iopl; + mcl->args[1] = (unsigned long)&iopl_op; +#else + mcl->op = __HYPERVISOR_physdev_op_compat; + pdo->cmd = PHYSDEVOP_set_iopl; + mcl->args[0] = (unsigned long)pdo++; +#endif + mcl++; + } + + if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { + set_xen_guest_handle(iobmp_op.bitmap, + (char *)next->io_bitmap_ptr); + iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0; +#if CONFIG_XEN_COMPAT > 0x030002 + mcl->op = __HYPERVISOR_physdev_op; + mcl->args[0] = PHYSDEVOP_set_iobitmap; + mcl->args[1] = (unsigned long)&iobmp_op; +#else + mcl->op = __HYPERVISOR_physdev_op_compat; + pdo->cmd = PHYSDEVOP_set_iobitmap; + mcl->args[0] = (unsigned long)pdo++; +#endif + mcl++; + } + +#if CONFIG_XEN_COMPAT <= 0x030002 + BUG_ON(pdo > _pdo + ARRAY_SIZE(_pdo)); +#endif + BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl)); + if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL))) + BUG(); + + /* + * Now maybe handle debug registers + */ + if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV || + task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) + __switch_to_xtra(prev_p, next_p); + + /* + * Leave lazy mode, flushing any hypercalls made here. + * This must be done before restoring TLS segments so + * the GDT and LDT are properly updated, and must be + * done before math_state_restore, so the TS bit is up + * to date. + */ + arch_end_context_switch(next_p); + + /* + * Restore %gs if needed (which is common) + */ + if (prev->gs | next->gs) + lazy_load_gs(next->gs); + + switch_fpu_finish(next_p, fpu); + + percpu_write(current_task, next_p); + + return prev_p; +} + +#define top_esp (THREAD_SIZE - sizeof(unsigned long)) +#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) + +unsigned long get_wchan(struct task_struct *p) +{ + unsigned long bp, sp, ip; + unsigned long stack_page; + int count = 0; + if (!p || p == current || p->state == TASK_RUNNING) + return 0; + stack_page = (unsigned long)task_stack_page(p); + sp = p->thread.sp; + if (!stack_page || sp < stack_page || sp > top_esp+stack_page) + return 0; + /* include/asm-i386/system.h:switch_to() pushes bp last. */ + bp = *(unsigned long *) sp; + do { + if (bp < stack_page || bp > top_ebp+stack_page) + return 0; + ip = *(unsigned long *) (bp+4); + if (!in_sched_functions(ip)) + return ip; + bp = *(unsigned long *) bp; + } while (count++ < 16); + return 0; +} + diff --git a/arch/x86/kernel/process_64-xen.c b/arch/x86/kernel/process_64-xen.c new file mode 100644 index 0000000..a9e3ce3 --- /dev/null +++ b/arch/x86/kernel/process_64-xen.c @@ -0,0 +1,695 @@ +/* + * Copyright (C) 1995 Linus Torvalds + * + * Pentium III FXSR, SSE support + * Gareth Hughes , May 2000 + * + * X86-64 port + * Andi Kleen. + * + * CPU hotplug support - ashok.raj@intel.com + * + * Jun Nakajima + * Modified for Xen + */ + +/* + * This file handles the architecture-dependent parts of process handling.. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +asmlinkage extern void ret_from_fork(void); + +static DEFINE_PER_CPU(unsigned char, is_idle); + +static ATOMIC_NOTIFIER_HEAD(idle_notifier); + +void idle_notifier_register(struct notifier_block *n) +{ + atomic_notifier_chain_register(&idle_notifier, n); +} +EXPORT_SYMBOL_GPL(idle_notifier_register); + +void idle_notifier_unregister(struct notifier_block *n) +{ + atomic_notifier_chain_unregister(&idle_notifier, n); +} +EXPORT_SYMBOL_GPL(idle_notifier_unregister); + +void enter_idle(void) +{ + percpu_write(is_idle, 1); + atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); +} + +static void __exit_idle(void) +{ + if (x86_test_and_clear_bit_percpu(0, is_idle) == 0) + return; + atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); +} + +/* Called from interrupts to signify idle end */ +void exit_idle(void) +{ + /* idle loop has pid 0 */ + if (current->pid) + return; + __exit_idle(); +} + +#ifndef CONFIG_SMP +static inline void play_dead(void) +{ + BUG(); +} +#endif + +/* + * The idle thread. There's no useful work to be + * done, so just try to conserve power and have a + * low exit latency (ie sit in a loop waiting for + * somebody to say that they'd like to reschedule) + */ +void cpu_idle(void) +{ + current_thread_info()->status |= TS_POLLING; + + /* + * If we're the non-boot CPU, nothing set the stack canary up + * for us. CPU0 already has it initialized but no harm in + * doing it again. This is a good place for updating it, as + * we wont ever return from this function (so the invalid + * canaries already on the stack wont ever trigger). + */ + boot_init_stack_canary(); + + /* endless idle loop with no priority at all */ + while (1) { + tick_nohz_idle_enter(); + while (!need_resched()) { + + rmb(); + + if (cpu_is_offline(smp_processor_id())) + play_dead(); + /* + * Idle routines should keep interrupts disabled + * from here on, until they go to idle. + * Otherwise, idle callbacks can misfire. + */ + local_touch_nmi(); + local_irq_disable(); + enter_idle(); + /* Don't trace irqs off for idle */ + stop_critical_timings(); + + /* enter_idle() needs rcu for notifiers */ + rcu_idle_enter(); + + if (cpuidle_idle_call()) + xen_idle(); + + rcu_idle_exit(); + start_critical_timings(); + + /* In many cases the interrupt that ended idle + has already called exit_idle. But some idle + loops can be woken up without interrupt. 
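enter_idle()/exit_idle() above fire an atomic notifier chain around idle residency; other kernel code can hook it through the idle_notifier_register() interface exported here, roughly as sketched below (my_idle_event and friends are hypothetical names).

/* Sketch: hook the x86-64 idle notifier chain exported above.
 * The callback runs on IDLE_START/IDLE_END transitions of a CPU.
 */
#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/printk.h>
#include <asm/idle.h>

static int my_idle_event(struct notifier_block *nb, unsigned long val,
			 void *data)
{
	if (val == IDLE_START)
		pr_debug("cpu entering idle\n");
	else if (val == IDLE_END)
		pr_debug("cpu leaving idle\n");
	return NOTIFY_OK;
}

static struct notifier_block my_idle_nb = {
	.notifier_call = my_idle_event,
};

static int __init my_idle_hook_init(void)
{
	idle_notifier_register(&my_idle_nb);
	return 0;
}
device_initcall(my_idle_hook_init);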
*/ + __exit_idle(); + } + + tick_nohz_idle_exit(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); + } +} + +/* Prints also some state that isn't saved in the pt_regs */ +void __show_regs(struct pt_regs *regs, int all) +{ + unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; + unsigned long d0, d1, d2, d3, d6, d7; + unsigned int fsindex, gsindex; + unsigned int ds, cs, es; + + show_regs_common(); + printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); + printk_address(regs->ip, 1); + printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, + regs->sp, regs->flags); + printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n", + regs->ax, regs->bx, regs->cx); + printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n", + regs->dx, regs->si, regs->di); + printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n", + regs->bp, regs->r8, regs->r9); + printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n", + regs->r10, regs->r11, regs->r12); + printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n", + regs->r13, regs->r14, regs->r15); + + asm("movl %%ds,%0" : "=r" (ds)); + asm("movl %%cs,%0" : "=r" (cs)); + asm("movl %%es,%0" : "=r" (es)); + asm("mov %%fs,%0" : "=r" (fsindex)); + asm("mov %%gs,%0" : "=r" (gsindex)); + + rdmsrl(MSR_FS_BASE, fs); + rdmsrl(MSR_GS_BASE, gs); + rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); + + if (!all) + return; + + cr0 = read_cr0(); + cr2 = read_cr2(); + cr3 = read_cr3(); + cr4 = read_cr4(); + + printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", + fs, fsindex, gs, gsindex, shadowgs); + printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, + es, cr0); + printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, + cr4); + + get_debugreg(d0, 0); + get_debugreg(d1, 1); + get_debugreg(d2, 2); + printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); + get_debugreg(d3, 3); + get_debugreg(d6, 6); + get_debugreg(d7, 7); + printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); +} + +void xen_load_gs_index(unsigned gs) +{ + WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs)); +} +EXPORT_SYMBOL(xen_load_gs_index); + +void release_thread(struct task_struct *dead_task) +{ + if (dead_task->mm) { + if (dead_task->mm->context.size) { + printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", + dead_task->comm, + dead_task->mm->context.ldt, + dead_task->mm->context.size); + BUG(); + } + } +} + +static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) +{ + struct user_desc ud = { + .base_addr = addr, + .limit = 0xfffff, + .seg_32bit = 1, + .limit_in_pages = 1, + .useable = 1, + }; + struct desc_struct *desc = t->thread.tls_array; + desc += tls; + fill_ldt(desc, &ud); +} + +static inline u32 read_32bit_tls(struct task_struct *t, int tls) +{ + return get_desc_base(&t->thread.tls_array[tls]); +} + +/* + * This gets called before we allocate a new thread and copy + * the current task into it. 
+ */ +void prepare_to_copy(struct task_struct *tsk) +{ + unlazy_fpu(tsk); +} + +int copy_thread(unsigned long clone_flags, unsigned long sp, + unsigned long unused, + struct task_struct *p, struct pt_regs *regs) +{ + int err; + struct pt_regs *childregs; + struct task_struct *me = current; + + childregs = ((struct pt_regs *) + (THREAD_SIZE + task_stack_page(p))) - 1; + *childregs = *regs; + + childregs->ax = 0; + if (user_mode(regs)) + childregs->sp = sp; + else + childregs->sp = (unsigned long)childregs; + + p->thread.sp = (unsigned long) childregs; + p->thread.sp0 = (unsigned long) (childregs+1); + + set_tsk_thread_flag(p, TIF_FORK); + + p->fpu_counter = 0; + p->thread.io_bitmap_ptr = NULL; + + savesegment(gs, p->thread.gsindex); + p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs; + savesegment(fs, p->thread.fsindex); + p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs; + savesegment(es, p->thread.es); + savesegment(ds, p->thread.ds); + + err = -ENOMEM; + memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); + + if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { + p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr, + IO_BITMAP_BYTES, GFP_KERNEL); + if (!p->thread.io_bitmap_ptr) { + p->thread.io_bitmap_max = 0; + return -ENOMEM; + } + set_tsk_thread_flag(p, TIF_IO_BITMAP); + } + + /* + * Set a new TLS for the child thread? + */ + if (clone_flags & CLONE_SETTLS) { +#ifdef CONFIG_IA32_EMULATION + if (test_thread_flag(TIF_IA32)) + err = do_set_thread_area(p, -1, + (struct user_desc __user *)childregs->si, 0); + else +#endif + err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); + if (err) + goto out; + } + p->thread.iopl = current->thread.iopl; + + err = 0; +out: + if (err && p->thread.io_bitmap_ptr) { + kfree(p->thread.io_bitmap_ptr); + p->thread.io_bitmap_max = 0; + } + + return err; +} + +static void +start_thread_common(struct pt_regs *regs, unsigned long new_ip, + unsigned long new_sp, + unsigned int _cs, unsigned int _ss, unsigned int _ds) +{ + loadsegment(fs, 0); + loadsegment(es, _ds); + loadsegment(ds, _ds); + load_gs_index(0); + regs->ip = new_ip; + regs->sp = new_sp; + regs->cs = _cs; + regs->ss = _ss; + regs->flags = X86_EFLAGS_IF; + /* + * Free the old FP and other extended state + */ + free_thread_xstate(current); +} + +void +start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) +{ + start_thread_common(regs, new_ip, new_sp, + __USER_CS, __USER_DS, 0); +} + +#ifdef CONFIG_IA32_EMULATION +void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp) +{ + start_thread_common(regs, new_ip, new_sp, + __USER32_CS, __USER32_DS, __USER32_DS); +} +#endif + +/* + * switch_to(x,y) should switch tasks from x to y. + * + * This could still be optimized: + * - fold all the options into a flag word and test it with a single test. + * - could test fs/gs bitsliced + * + * Kprobes not supported here. Set the probe on schedule instead. + * Function graph tracer not supported too. 
+ */ +__notrace_funcgraph struct task_struct * +__switch_to(struct task_struct *prev_p, struct task_struct *next_p) +{ + struct thread_struct *prev = &prev_p->thread; + struct thread_struct *next = &next_p->thread; + int cpu = smp_processor_id(); +#ifndef CONFIG_X86_NO_TSS + struct tss_struct *tss = &per_cpu(init_tss, cpu); +#endif + fpu_switch_t fpu; +#if CONFIG_XEN_COMPAT > 0x030002 + struct physdev_set_iopl iopl_op; + struct physdev_set_iobitmap iobmp_op; +#else + struct physdev_op _pdo[2], *pdo = _pdo; +#define iopl_op pdo->u.set_iopl +#define iobmp_op pdo->u.set_iobitmap +#endif + multicall_entry_t _mcl[8], *mcl = _mcl; + + fpu = xen_switch_fpu_prepare(prev_p, next_p, cpu, &mcl); + + /* + * Reload sp0. + * This is load_sp0(tss, next) with a multicall. + */ + mcl->op = __HYPERVISOR_stack_switch; + mcl->args[0] = __KERNEL_DS; + mcl->args[1] = next->sp0; + mcl++; + + /* + * Load the per-thread Thread-Local Storage descriptor. + * This is load_TLS(next, cpu) with multicalls. + */ +#define C(i) do { \ + if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \ + next->tls_array[i].b != prev->tls_array[i].b)) { \ + mcl->op = __HYPERVISOR_update_descriptor; \ + mcl->args[0] = arbitrary_virt_to_machine( \ + &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\ + mcl->args[1] = *(u64 *)&next->tls_array[i]; \ + mcl++; \ + } \ +} while (0) + C(0); C(1); C(2); +#undef C + + if (unlikely(prev->iopl != next->iopl)) { + iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3; +#if CONFIG_XEN_COMPAT > 0x030002 + mcl->op = __HYPERVISOR_physdev_op; + mcl->args[0] = PHYSDEVOP_set_iopl; + mcl->args[1] = (unsigned long)&iopl_op; +#else + mcl->op = __HYPERVISOR_physdev_op_compat; + pdo->cmd = PHYSDEVOP_set_iopl; + mcl->args[0] = (unsigned long)pdo++; +#endif + mcl++; + } + + if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { + set_xen_guest_handle(iobmp_op.bitmap, + (char *)next->io_bitmap_ptr); + iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0; +#if CONFIG_XEN_COMPAT > 0x030002 + mcl->op = __HYPERVISOR_physdev_op; + mcl->args[0] = PHYSDEVOP_set_iobitmap; + mcl->args[1] = (unsigned long)&iobmp_op; +#else + mcl->op = __HYPERVISOR_physdev_op_compat; + pdo->cmd = PHYSDEVOP_set_iobitmap; + mcl->args[0] = (unsigned long)pdo++; +#endif + mcl++; + } + +#if CONFIG_XEN_COMPAT <= 0x030002 + BUG_ON(pdo > _pdo + ARRAY_SIZE(_pdo)); +#endif + BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl)); + if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL))) + BUG(); + + /* + * Switch DS and ES. + * This won't pick up thread selector changes, but I guess that is ok. + */ + if (unlikely(next->es)) + loadsegment(es, next->es); + + if (unlikely(next->ds)) + loadsegment(ds, next->ds); + + /* + * Leave lazy mode, flushing any hypercalls made here. + * This must be done before restoring TLS segments so + * the GDT and LDT are properly updated, and must be + * done before math_state_restore, so the TS bit is up + * to date. + */ + arch_end_context_switch(next_p); + + /* + * Switch FS and GS. + * + * Segment register != 0 always requires a reload. Also + * reload when it has changed. When prev process used 64bit + * base always reload to avoid an information leak. 
+ */ + if (unlikely(next->fsindex)) + loadsegment(fs, next->fsindex); + + if (next->fs) + WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_FS, next->fs)); + + if (unlikely(next->gsindex)) + load_gs_index(next->gsindex); + + if (next->gs) + WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER, next->gs)); + + switch_fpu_finish(next_p, fpu); + + /* + * Switch the PDA context. + */ + percpu_write(current_task, next_p); + + percpu_write(kernel_stack, + (unsigned long)task_stack_page(next_p) + + THREAD_SIZE - KERNEL_STACK_OFFSET); + + /* + * Now maybe reload the debug registers + */ + if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT || + task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) + __switch_to_xtra(prev_p, next_p); + + return prev_p; +} + +void set_personality_64bit(void) +{ + /* inherit personality from parent */ + + /* Make sure to be in 64bit mode */ + clear_thread_flag(TIF_IA32); + + /* Ensure the corresponding mm is not marked. */ + if (current->mm) + current->mm->context.ia32_compat = 0; + + /* TBD: overwrites user setup. Should have two bits. + But 64bit processes have always behaved this way, + so it's not too bad. The main problem is just that + 32bit childs are affected again. */ + current->personality &= ~READ_IMPLIES_EXEC; +} + +void set_personality_ia32(void) +{ + /* inherit personality from parent */ + + /* Make sure to be in 32bit mode */ + set_thread_flag(TIF_IA32); + current->personality |= force_personality32; + + /* Mark the associated mm as containing 32-bit tasks. */ + if (current->mm) + current->mm->context.ia32_compat = 1; + + /* Prepare the first "return" to user space */ + current_thread_info()->status |= TS_COMPAT; +} + +unsigned long get_wchan(struct task_struct *p) +{ + unsigned long stack; + u64 fp, ip; + int count = 0; + + if (!p || p == current || p->state == TASK_RUNNING) + return 0; + stack = (unsigned long)task_stack_page(p); + if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE) + return 0; + fp = *(u64 *)(p->thread.sp); + do { + if (fp < (unsigned long)stack || + fp >= (unsigned long)stack+THREAD_SIZE) + return 0; + ip = *(u64 *)(fp+8); + if (!in_sched_functions(ip)) + return ip; + fp = *(u64 *)fp; + } while (count++ < 16); + return 0; +} + +long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) +{ + int ret = 0; + int doit = task == current; + int cpu; + + switch (code) { + case ARCH_SET_GS: + if (addr >= TASK_SIZE_OF(task)) + return -EPERM; + cpu = get_cpu(); + /* handle small bases via the GDT because that's faster to + switch. */ + if (addr <= 0xffffffff) { + set_32bit_tls(task, GS_TLS, addr); + if (doit) { + load_TLS(&task->thread, cpu); + load_gs_index(GS_TLS_SEL); + } + task->thread.gsindex = GS_TLS_SEL; + task->thread.gs = 0; + } else { + task->thread.gsindex = 0; + task->thread.gs = addr; + if (doit) { + load_gs_index(0); + ret = HYPERVISOR_set_segment_base( + SEGBASE_GS_USER, addr); + } + } + put_cpu(); + break; + case ARCH_SET_FS: + /* Not strictly needed for fs, but do it for symmetry + with gs */ + if (addr >= TASK_SIZE_OF(task)) + return -EPERM; + cpu = get_cpu(); + /* handle small bases via the GDT because that's faster to + switch. 
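do_arch_prctl() here is what the arch_prctl syscall lands in; the small-base optimization (GDT TLS slot versus SEGBASE_* hypercall) is invisible to the caller. A user-space sketch querying the FS base and moving the GS base:

/* Sketch: exercise ARCH_GET_FS/ARCH_SET_GS from user space (x86-64).
 * The kernel-side handler is the do_arch_prctl() shown in this file.
 */
#include <asm/prctl.h>		/* ARCH_SET_GS, ARCH_GET_FS, ... */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static unsigned long scratch[16];	/* illustrative GS target */

int main(void)
{
	unsigned long fs_base = 0;

	if (syscall(SYS_arch_prctl, ARCH_GET_FS, &fs_base) == 0)
		printf("FS base: %#lx\n", fs_base);

	/* Point GS at a private block; small addresses go via the GDT,
	 * large ones via the SEGBASE_GS_USER hypercall in this tree. */
	if (syscall(SYS_arch_prctl, ARCH_SET_GS, (unsigned long)scratch))
		perror("arch_prctl(ARCH_SET_GS)");

	return 0;
}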
*/
+		if (addr <= 0xffffffff) {
+			set_32bit_tls(task, FS_TLS, addr);
+			if (doit) {
+				load_TLS(&task->thread, cpu);
+				loadsegment(fs, FS_TLS_SEL);
+			}
+			task->thread.fsindex = FS_TLS_SEL;
+			task->thread.fs = 0;
+		} else {
+			task->thread.fsindex = 0;
+			task->thread.fs = addr;
+			if (doit) {
+				/* set the selector to 0 to not confuse
+				   __switch_to */
+				loadsegment(fs, 0);
+				ret = HYPERVISOR_set_segment_base(SEGBASE_FS,
+								  addr);
+			}
+		}
+		put_cpu();
+		break;
+	case ARCH_GET_FS: {
+		unsigned long base;
+		if (task->thread.fsindex == FS_TLS_SEL)
+			base = read_32bit_tls(task, FS_TLS);
+		else if (doit)
+			rdmsrl(MSR_FS_BASE, base);
+		else
+			base = task->thread.fs;
+		ret = put_user(base, (unsigned long __user *)addr);
+		break;
+	}
+	case ARCH_GET_GS: {
+		unsigned long base;
+		unsigned gsindex;
+		if (task->thread.gsindex == GS_TLS_SEL)
+			base = read_32bit_tls(task, GS_TLS);
+		else if (doit) {
+			savesegment(gs, gsindex);
+			if (gsindex)
+				rdmsrl(MSR_KERNEL_GS_BASE, base);
+			else
+				base = task->thread.gs;
+		} else
+			base = task->thread.gs;
+		ret = put_user(base, (unsigned long __user *)addr);
+		break;
+	}
+
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+long sys_arch_prctl(int code, unsigned long addr)
+{
+	return do_arch_prctl(current, code, addr);
+}
+
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 03920a1..e22394c 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -4,9 +4,7 @@
 #include
 #include
 
-#include
-
-#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
+#if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_SMP) || defined(CONFIG_XEN)) && defined(CONFIG_PCI)
 
 static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
 {
@@ -34,10 +32,21 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
 	if (!(word & (1 << 13))) {
 		dev_info(&dev->dev, "Intel E7520/7320/7525 detected; "
 			"disabling irq balancing and affinity\n");
+#ifndef CONFIG_XEN
 		noirqdebug_setup("");
 #ifdef CONFIG_PROC_FS
 		no_irq_affinity = 1;
 #endif
+#else
+		{
+			struct xen_platform_op op = {
+				.cmd = XENPF_platform_quirk,
+				.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING
+			};
+
+			WARN_ON(HYPERVISOR_platform_op(&op));
+		}
+#endif
 	}
 
 	/* put back the original value for config space*/
@@ -53,6 +62,8 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH,
 #endif
 
 #if defined(CONFIG_HPET_TIMER)
+#include
+
 unsigned long force_hpet_address;
 
 static enum {
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index 36818f8..06ee365 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -87,14 +87,32 @@ relocate_kernel:
 	movl	PTR(PA_PGD)(%ebp), %eax
 	movl	%eax, %cr3
 
+	/* setup idt */
+	lidtl	idt_48 - relocate_kernel(%edi)
+
+	/* setup gdt */
+	leal	gdt - relocate_kernel(%edi), %eax
+	movl	%eax, (gdt_48 - relocate_kernel) + 2(%edi)
+	lgdtl	gdt_48 - relocate_kernel(%edi)
+
+	/* setup data segment registers */
+	mov	$(gdt_ds - gdt), %eax
+	mov	%eax, %ds
+	mov	%eax, %es
+	mov	%eax, %fs
+	mov	%eax, %gs
+	mov	%eax, %ss
+
 	/* setup a new stack at the end of the physical control page */
 	lea	PAGE_SIZE(%edi), %esp
 
-	/* jump to identity mapped page */
+	/* load new code segment and jump to identity mapped page */
+	pushl	$0
+	pushl	$(gdt_cs - gdt)
 	movl	%edi, %eax
 	addl	$(identity_mapped - relocate_kernel), %eax
 	pushl	%eax
-	ret
+	iretl
 
 identity_mapped:
 	/* set return address to 0 if not preserving context */
@@ -273,5 +291,22 @@ swap_pages:
 	popl	%ebp
 	ret
 
+	.align	16
+gdt:
+	.quad	0x0000000000000000	/* NULL descriptor */
+gdt_cs:
+	.quad	0x00cf9a000000ffff	/* kernel 4GB code at 0x00000000 */
+gdt_ds:
+	.quad	0x00cf92000000ffff	/* kernel 4GB data at 0x00000000 */
+gdt_end:
+
+gdt_48:
+	.word	gdt_end - gdt - 1	/* limit */
+	.long	0			/* base - filled in by code above */
+
+idt_48:
+	.word	0			/* limit */
+	.long	0			/* base */
+
 	.globl kexec_control_code_size
 .set kexec_control_code_size, . - relocate_kernel
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 7a6f3b3..b591fca 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -91,13 +91,30 @@ relocate_kernel:
 	/* Switch to the identity mapped page tables */
 	movq	%r9, %cr3
 
+	/* setup idt */
+	lidtq	idt_80 - relocate_kernel(%r8)
+
+	/* setup gdt */
+	leaq	gdt - relocate_kernel(%r8), %rax
+	movq	%rax, (gdt_80 - relocate_kernel) + 2(%r8)
+	lgdtq	gdt_80 - relocate_kernel(%r8)
+
+	/* setup data segment registers */
+	xorl	%eax, %eax
+	movl	%eax, %ds
+	movl	%eax, %es
+	movl	%eax, %fs
+	movl	%eax, %gs
+	movl	%eax, %ss
+
 	/* setup a new stack at the end of the physical control page */
 	lea	PAGE_SIZE(%r8), %rsp
 
-	/* jump to identity mapped page */
+	/* load new code segment and jump to identity mapped page */
 	addq	$(identity_mapped - relocate_kernel), %r8
+	pushq	$(gdt_cs - gdt)
 	pushq	%r8
-	ret
+	lretq
 
 identity_mapped:
 	/* set return address to 0 if not preserving context */
@@ -264,5 +281,20 @@ swap_pages:
 3:
 	ret
 
+	.align	16
+gdt:
+	.quad	0x0000000000000000	/* NULL descriptor */
+gdt_cs:
+	.quad	0x00af9a000000ffff
+gdt_end:
+
+gdt_80:
+	.word	gdt_end - gdt - 1	/* limit */
+	.quad	0			/* base - filled in by code above */
+
+idt_80:
+	.word	0			/* limit */
+	.quad	0			/* base */
+
 	.globl kexec_control_code_size
.set kexec_control_code_size, . - relocate_kernel
diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c
index 2a26819..7cc4a75 100644
--- a/arch/x86/kernel/resource.c
+++ b/arch/x86/kernel/resource.c
@@ -1,3 +1,7 @@
+#ifdef CONFIG_XEN
+# define e820 machine_e820
+# include
+#endif
 #include
 #include
 
@@ -37,6 +41,10 @@ static void remove_e820_regions(struct resource *avail)
 
 void arch_remove_reservations(struct resource *avail)
 {
+#ifdef CONFIG_XEN
+	if (!is_initial_xendomain())
+		return;
+#endif
 	/* Trim out BIOS areas (low 1MB and high 2MB) and E820 regions */
 	if (avail->flags & IORESOURCE_MEM) {
 		if (avail->start < BIOS_END)
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index af6db6e..4598f0b 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -30,6 +30,7 @@ EXPORT_SYMBOL(cmos_lock);
 DEFINE_SPINLOCK(rtc_lock);
 EXPORT_SYMBOL(rtc_lock);
 
+#ifndef CONFIG_XEN_UNPRIVILEGED_GUEST
 /*
  * In order to set the CMOS clock precisely, set_rtc_mmss has to be
  * called 500 ms after the second nowtime has started, because when
@@ -155,6 +156,7 @@ unsigned long mach_get_cmos_time(void)
 
 	return mktime(year, mon, day, hour, min, sec);
 }
+#endif /* CONFIG_XEN_UNPRIVILEGED_GUEST */
 
 /* Routines for accessing the CMOS RAM/RTC. */
 unsigned char rtc_cmos_read(unsigned char addr)
@@ -202,6 +204,7 @@ unsigned long long native_read_tsc(void)
 
 EXPORT_SYMBOL(native_read_tsc);
 
+#ifndef CONFIG_XEN_UNPRIVILEGED_GUEST
 static struct resource rtc_resources[] = {
 	[0] = {
 		.start	= RTC_PORT(0),
@@ -247,6 +250,11 @@ static __init int add_rtc_cmos(void)
 	if (mrst_identify_cpu())
 		return -ENODEV;
 
+#ifdef CONFIG_XEN
+	if (!is_initial_xendomain())
+		return -ENODEV;
+#endif
+
 	platform_device_register(&rtc_device);
 	dev_info(&rtc_device.dev,
 		 "registered platform RTC device (no PNP device found)\n");
@@ -254,3 +262,4 @@ static __init int add_rtc_cmos(void)
 	return 0;
 }
 device_initcall(add_rtc_cmos);
+#endif /* CONFIG_XEN_UNPRIVILEGED_GUEST */
diff --git a/arch/x86/kernel/setup-xen.c b/arch/x86/kernel/setup-xen.c
new file mode 100644
index 0000000..52bbdfb
--- /dev/null
+++ b/arch/x86/kernel/setup-xen.c
@@ -0,0 +1,1477 @@
+/*
+ * Copyright (C) 1995 Linus Torvalds
+ *
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ *
+ * Memory region support
+ *	David Parsons , July-August 1999
+ *
+ * Added E820 sanitization routine (removes overlapping memory regions);
+ *	Brian Moyle , February 2001
+ *
+ * Moved CPU detection code to cpu/${cpu}.c
+ *	Patrick Mochel , March 2002
+ *
+ * Provisions for empty E820 memory regions (reported by certain BIOSes).
+ * Alex Achenbach , December 2002.
+ *
+ */
+
+/*
+ * This file handles the architecture-dependent parts of initialization
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include