- Update to 3.4-rc7.

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index d99522e..6fbd402 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -11,6 +11,7 @@
  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
  */
 
+#include <linux/cpu.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/smp.h>
 #include <linux/console.h>
 #include <linux/pci.h>
 #include <linux/gfp.h>
+#include <linux/memblock.h>
 
 #include <xen/xen.h>
 #include <xen/interface/xen.h>
 #include <xen/interface/version.h>
 #include <xen/interface/physdev.h>
 #include <xen/interface/vcpu.h>
+#include <xen/interface/memory.h>
 #include <xen/features.h>
 #include <xen/page.h>
+#include <xen/hvm.h>
 #include <xen/hvc-console.h>
 
 #include <asm/paravirt.h>
 #include <asm/apic.h>
 #include <asm/page.h>
+#include <asm/xen/pci.h>
 #include <asm/xen/hypercall.h>
 #include <asm/xen/hypervisor.h>
 #include <asm/fixmap.h>
 #include <asm/tlbflush.h>
 #include <asm/reboot.h>
 #include <asm/stackprotector.h>
+#include <asm/hypervisor.h>
+#include <asm/mwait.h>
+#include <asm/pci_x86.h>
+
+#ifdef CONFIG_ACPI
+#include <linux/acpi.h>
+#include <asm/acpi.h>
+#include <acpi/pdc_intel.h>
+#include <acpi/processor.h>
+#include <xen/interface/platform.h>
+#endif
 
 #include "xen-ops.h"
 #include "mmu.h"
@@ -69,6 +85,11 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
 enum xen_domain_type xen_domain_type = XEN_NATIVE;
 EXPORT_SYMBOL_GPL(xen_domain_type);
 
+unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
+EXPORT_SYMBOL(machine_to_phys_mapping);
+unsigned long  machine_to_phys_nr;
+EXPORT_SYMBOL(machine_to_phys_nr);
+
 struct start_info *xen_start_info;
 EXPORT_SYMBOL_GPL(xen_start_info);
 
@@ -76,6 +97,10 @@ struct shared_info xen_dummy_shared_info;
 
 void *xen_initial_gdt;
 
+RESERVE_BRK(shared_info_page_brk, PAGE_SIZE);
+__read_mostly int xen_have_vector_callback;
+EXPORT_SYMBOL_GPL(xen_have_vector_callback);
+
 /*
  * Point at some empty memory to start with. We map the real shared_info
  * page as soon as fixmap is up and running.
@@ -100,8 +125,8 @@ static int have_vcpu_info_placement = 1;
 static void clamp_max_cpus(void)
 {
 #ifdef CONFIG_SMP
-       if (setup_max_cpus > MAX_VIRT_CPUS)
-               setup_max_cpus = MAX_VIRT_CPUS;
+       if (setup_max_cpus > XEN_LEGACY_MAX_VCPUS)
+               setup_max_cpus = XEN_LEGACY_MAX_VCPUS;
 #endif
 }
 
@@ -113,11 +138,11 @@ static void xen_vcpu_setup(int cpu)
 
        BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
 
-       if (cpu < MAX_VIRT_CPUS)
+       if (cpu < XEN_LEGACY_MAX_VCPUS)
                per_cpu(xen_vcpu,cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
 
        if (!have_vcpu_info_placement) {
-               if (cpu >= MAX_VIRT_CPUS)
+               if (cpu >= XEN_LEGACY_MAX_VCPUS)
                        clamp_max_cpus();
                return;
        }
@@ -126,9 +151,6 @@ static void xen_vcpu_setup(int cpu)
        info.mfn = arbitrary_virt_to_mfn(vcpup);
        info.offset = offset_in_page(vcpup);
 
-       printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
-              cpu, vcpup, info.mfn, info.offset);
-
        /* Check to see if the hypervisor will put the vcpu_info
           structure where we want it, which allows direct access via
           a percpu-variable. */
@@ -142,9 +164,6 @@ static void xen_vcpu_setup(int cpu)
                /* This cpu is using the registered vcpu info, even if
                   later ones fail to. */
                per_cpu(xen_vcpu, cpu) = vcpup;
-
-               printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n",
-                      cpu, vcpup);
        }
 }
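
The registration this function builds up to happens in context elided by the diff: the guest offers the hypervisor a location inside its percpu area for the vcpu_info structure. A minimal sketch of that interface, assuming the standard VCPUOP_register_vcpu_info hypercall (the error handling in the real function may differ):

    /* Sketch: ask Xen to mirror this vcpu's state at vcpup, so the
     * guest can reach it through a percpu variable instead of the
     * shared_info page (which only covers XEN_LEGACY_MAX_VCPUS). */
    struct vcpu_register_vcpu_info info = {
            .mfn    = arbitrary_virt_to_mfn(vcpup),
            .offset = offset_in_page(vcpup),
    };

    if (HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info))
            clamp_max_cpus();               /* refused: legacy layout only */
    else
            per_cpu(xen_vcpu, cpu) = vcpup; /* direct percpu access */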
 
@@ -191,13 +210,17 @@ static void __init xen_banner(void)
 static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0;
 static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0;
 
+static __read_mostly unsigned int cpuid_leaf1_ecx_set_mask;
+static __read_mostly unsigned int cpuid_leaf5_ecx_val;
+static __read_mostly unsigned int cpuid_leaf5_edx_val;
+
 static void xen_cpuid(unsigned int *ax, unsigned int *bx,
                      unsigned int *cx, unsigned int *dx)
 {
        unsigned maskebx = ~0;
        unsigned maskecx = ~0;
        unsigned maskedx = ~0;
-
+       unsigned setecx = 0;
        /*
         * Mask out inconvenient features, to try and disable as many
         * unsupported kernel subsystems as possible.
@@ -205,9 +228,18 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
        switch (*ax) {
        case 1:
                maskecx = cpuid_leaf1_ecx_mask;
+               setecx = cpuid_leaf1_ecx_set_mask;
                maskedx = cpuid_leaf1_edx_mask;
                break;
 
+       case CPUID_MWAIT_LEAF:
+               /* Synthesize the values. */
+               *ax = 0;
+               *bx = 0;
+               *cx = cpuid_leaf5_ecx_val;
+               *dx = cpuid_leaf5_edx_val;
+               return;
+
        case 0xb:
                /* Suppress extended topology stuff */
                maskebx = 0;
@@ -223,40 +255,104 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
 
        *bx &= maskebx;
        *cx &= maskecx;
+       *cx |= setecx;
        *dx &= maskedx;
 }
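
With the new set-mask in place, the leaf-1 ECX a guest observes is (hardware_ecx & maskecx) | setecx: a bit can be hidden (XSAVE/OSXSAVE, ECX bits 26/27, when Xen refuses CR4.OSXSAVE) or force-advertised (MWAIT, ECX bit 3, for the initial domain). A standalone sketch of that semantics; the helper name is invented for illustration:

    /* Sketch: filter a raw leaf-1 ECX the way xen_cpuid() does.
     * 'maskecx' clears features Xen cannot support; 'setecx'
     * advertises features the hypervisor deliberately hides. */
    static unsigned int filter_leaf1_ecx(unsigned int hw_ecx,
                                         unsigned int maskecx,
                                         unsigned int setecx)
    {
            return (hw_ecx & maskecx) | setecx;
    }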
 
-static __init void xen_init_cpuid_mask(void)
+static bool __init xen_check_mwait(void)
+{
+#if defined(CONFIG_ACPI) && !defined(CONFIG_ACPI_PROCESSOR_AGGREGATOR) && \
+       !defined(CONFIG_ACPI_PROCESSOR_AGGREGATOR_MODULE)
+       struct xen_platform_op op = {
+               .cmd                    = XENPF_set_processor_pminfo,
+               .u.set_pminfo.id        = -1,
+               .u.set_pminfo.type      = XEN_PM_PDC,
+       };
+       uint32_t buf[3];
+       unsigned int ax, bx, cx, dx;
+       unsigned int mwait_mask;
+
+       /* We need to determine whether it is OK to expose the MWAIT
+        * capability to the kernel to harvest deeper than C3 states from ACPI
+        * _CST using the processor_harvest_xen.c module. For this to work, we
+        * need to gather the MWAIT_LEAF values (which the cstate.c code
+        * checks against). The hypervisor won't expose the MWAIT flag because
+        * it would break backwards compatibility, so we find it out
+        * directly from the hardware and via a hypercall.
+        */
+       if (!xen_initial_domain())
+               return false;
+
+       ax = 1;
+       cx = 0;
+
+       native_cpuid(&ax, &bx, &cx, &dx);
+
+       mwait_mask = (1 << (X86_FEATURE_EST % 32)) |
+                    (1 << (X86_FEATURE_MWAIT % 32));
+
+       if ((cx & mwait_mask) != mwait_mask)
+               return false;
+
+       /* We need to emulate the MWAIT_LEAF and for that we need both
+        * ecx and edx. The hypercall provides only partial information.
+        */
+
+       ax = CPUID_MWAIT_LEAF;
+       bx = 0;
+       cx = 0;
+       dx = 0;
+
+       native_cpuid(&ax, &bx, &cx, &dx);
+
+       /* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so,
+        * don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3.
+        */
+       buf[0] = ACPI_PDC_REVISION_ID;
+       buf[1] = 1;
+       buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP);
+
+       set_xen_guest_handle(op.u.set_pminfo.pdc, buf);
+
+       if ((HYPERVISOR_dom0_op(&op) == 0) &&
+           (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) {
+               cpuid_leaf5_ecx_val = cx;
+               cpuid_leaf5_edx_val = dx;
+       }
+       return true;
+#else
+       return false;
+#endif
+}
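
The values xen_check_mwait() preserves in cpuid_leaf5_ecx_val/cpuid_leaf5_edx_val are replayed by the CPUID_MWAIT_LEAF case in xen_cpuid() above. For reference, EDX of leaf 5 packs a 4-bit count of MWAIT sub-C-states per C-state, which is what the cstate.c checks consume; a hedged sketch of the decoding (helper name invented here):

    /* Sketch: decode the synthesized MWAIT leaf (CPUID leaf 5).
     * Each C-state gets a 4-bit sub-C-state count in EDX, matching
     * MWAIT_SUBSTATE_SIZE/MWAIT_SUBSTATE_MASK in <asm/mwait.h>. */
    static unsigned int mwait_substates(unsigned int edx, int cstate)
    {
            return (edx >> (cstate * 4)) & 0xf;
    }
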
+static void __init xen_init_cpuid_mask(void)
 {
        unsigned int ax, bx, cx, dx;
+       unsigned int xsave_mask;
 
        cpuid_leaf1_edx_mask =
                ~((1 << X86_FEATURE_MCE)  |  /* disable MCE */
                  (1 << X86_FEATURE_MCA)  |  /* disable MCA */
+                 (1 << X86_FEATURE_MTRR) |  /* disable MTRR */
                  (1 << X86_FEATURE_ACC));   /* thermal monitoring */
 
        if (!xen_initial_domain())
                cpuid_leaf1_edx_mask &=
                        ~((1 << X86_FEATURE_APIC) |  /* disable local APIC */
                          (1 << X86_FEATURE_ACPI));  /* disable ACPI */
-
        ax = 1;
        cx = 0;
        xen_cpuid(&ax, &bx, &cx, &dx);
 
-       /* cpuid claims we support xsave; try enabling it to see what happens */
-       if (cx & (1 << (X86_FEATURE_XSAVE % 32))) {
-               unsigned long cr4;
-
-               set_in_cr4(X86_CR4_OSXSAVE);
-               
-               cr4 = read_cr4();
-
-               if ((cr4 & X86_CR4_OSXSAVE) == 0)
-                       cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32));
+       xsave_mask =
+               (1 << (X86_FEATURE_XSAVE % 32)) |
+               (1 << (X86_FEATURE_OSXSAVE % 32));
 
-               clear_in_cr4(X86_CR4_OSXSAVE);
-       }
+       /* Xen will set CR4.OSXSAVE if supported and not disabled by force */
+       if ((cx & xsave_mask) != xsave_mask)
+               cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */
+       if (xen_check_mwait())
+               cpuid_leaf1_ecx_set_mask = (1 << (X86_FEATURE_MWAIT % 32));
 }
 
 static void xen_set_debugreg(int reg, unsigned long val)
@@ -338,6 +434,8 @@ static void xen_set_ldt(const void *addr, unsigned entries)
        struct mmuext_op *op;
        struct multicall_space mcs = xen_mc_entry(sizeof(*op));
 
+       trace_xen_cpu_set_ldt(addr, entries);
+
        op = mcs.args;
        op->cmd = MMUEXT_SET_LDT;
        op->arg1.linear_addr = (unsigned long)addr;
@@ -397,7 +495,7 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
 /*
  * load_gdt for early boot, when the gdt is only mapped once
  */
-static __init void xen_load_gdt_boot(const struct desc_ptr *dtr)
+static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
 {
        unsigned long va = dtr->address;
        unsigned int size = dtr->size + 1;
@@ -493,6 +591,8 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
        xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
        u64 entry = *(u64 *)ptr;
 
+       trace_xen_cpu_write_ldt_entry(dt, entrynum, entry);
+
        preempt_disable();
 
        xen_mc_flush();
@@ -562,10 +662,12 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
        unsigned long p = (unsigned long)&dt[entrynum];
        unsigned long start, end;
 
+       trace_xen_cpu_write_idt_entry(dt, entrynum, g);
+
        preempt_disable();
 
-       start = __get_cpu_var(idt_desc).address;
-       end = start + __get_cpu_var(idt_desc).size + 1;
+       start = __this_cpu_read(idt_desc.address);
+       end = start + __this_cpu_read(idt_desc.size) + 1;
 
        xen_mc_flush();
 
@@ -616,6 +718,8 @@ static void xen_load_idt(const struct desc_ptr *desc)
        static DEFINE_SPINLOCK(lock);
        static struct trap_info traps[257];
 
+       trace_xen_cpu_load_idt(desc);
+
        spin_lock(&lock);
 
        __get_cpu_var(idt_desc) = *desc;
@@ -634,6 +738,8 @@ static void xen_load_idt(const struct desc_ptr *desc)
 static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
                                const void *desc, int type)
 {
+       trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
+
        preempt_disable();
 
        switch (type) {
@@ -659,9 +765,11 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
  * Version of write_gdt_entry for use at early boot-time needed to
  * update an entry as simply as possible.
  */
-static __init void xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
+static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
                                            const void *desc, int type)
 {
+       trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
+
        switch (type) {
        case DESC_LDT:
        case DESC_TSS:
@@ -681,7 +789,9 @@ static __init void xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
 static void xen_load_sp0(struct tss_struct *tss,
                         struct thread_struct *thread)
 {
-       struct multicall_space mcs = xen_mc_entry(0);
+       struct multicall_space mcs;
+
+       mcs = xen_mc_entry(0);
        MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
        xen_mc_issue(PARAVIRT_LAZY_CPU);
 }
@@ -700,9 +810,40 @@ static void xen_io_delay(void)
 }
 
 #ifdef CONFIG_X86_LOCAL_APIC
+static unsigned long xen_set_apic_id(unsigned int x)
+{
+       WARN_ON(1);
+       return x;
+}
+static unsigned int xen_get_apic_id(unsigned long x)
+{
+       return ((x)>>24) & 0xFFu;
+}
 static u32 xen_apic_read(u32 reg)
 {
-       return 0;
+       struct xen_platform_op op = {
+               .cmd = XENPF_get_cpuinfo,
+               .interface_version = XENPF_INTERFACE_VERSION,
+               .u.pcpu_info.xen_cpuid = 0,
+       };
+       int ret = 0;
+
+       /* Shouldn't need this as APIC is turned off for PV, and we only
+        * get called on the bootup processor. But just in case. */
+       if (!xen_initial_domain() || smp_processor_id())
+               return 0;
+
+       if (reg == APIC_LVR)
+               return 0x10;
+
+       if (reg != APIC_ID)
+               return 0;
+
+       ret = HYPERVISOR_dom0_op(&op);
+       if (ret)
+               return 0;
+
+       return op.u.pcpu_info.apic_id << 24;
 }
 
 static void xen_apic_write(u32 reg, u32 val)
@@ -740,6 +881,8 @@ static void set_xen_basic_apic_ops(void)
        apic->icr_write = xen_apic_icr_write;
        apic->wait_icr_idle = xen_apic_wait_icr_idle;
        apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle;
+       apic->set_apic_id = xen_set_apic_id;
+       apic->get_apic_id = xen_get_apic_id;
 }
 
 #endif
@@ -759,11 +902,11 @@ static DEFINE_PER_CPU(unsigned long, xen_cr0_value);
 
 static unsigned long xen_read_cr0(void)
 {
-       unsigned long cr0 = percpu_read(xen_cr0_value);
+       unsigned long cr0 = this_cpu_read(xen_cr0_value);
 
        if (unlikely(cr0 == 0)) {
                cr0 = native_read_cr0();
-               percpu_write(xen_cr0_value, cr0);
+               this_cpu_write(xen_cr0_value, cr0);
        }
 
        return cr0;
@@ -773,7 +916,7 @@ static void xen_write_cr0(unsigned long cr0)
 {
        struct multicall_space mcs;
 
-       percpu_write(xen_cr0_value, cr0);
+       this_cpu_write(xen_cr0_value, cr0);
 
        /* Only pay attention to cr0.TS; everything else is
           ignored. */
@@ -826,6 +969,11 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
                   Xen console noise. */
                break;
 
+       case MSR_IA32_CR_PAT:
+               if (smp_processor_id() == 0)
+                       xen_set_pat(((u64)high << 32) | low);
+               break;
+
        default:
                ret = native_write_msr_safe(msr, low, high);
        }
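
The intercepted PAT write hands xen_set_pat() the full 64-bit value reassembled from the two register halves. For orientation, each byte of the IA32_PAT MSR selects the memory type for one of the eight PAT entries; a hedged sketch of that layout (helper invented for illustration):

    /* Sketch: the IA32_PAT MSR packs eight one-byte entries; the low
     * 3 bits of each select a memory type (0=UC, 1=WC, 4=WT, 5=WP,
     * 6=WB, 7=UC-). xen_set_pat() receives ((u64)high << 32) | low. */
    static inline unsigned int pat_entry_type(u64 pat, int entry)
    {
            return (pat >> (entry * 8)) & 0x7;
    }
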
@@ -853,7 +1001,7 @@ void xen_setup_shared_info(void)
        xen_setup_mfn_list_list();
 }
 
-/* This is called once we have the cpu_possible_map */
+/* This is called once we have the cpu_possible_mask */
 void xen_setup_vcpu_info_placement(void)
 {
        int cpu;
@@ -864,8 +1012,6 @@ void xen_setup_vcpu_info_placement(void)
        /* xen_vcpu_setup managed to place the vcpu_info within the
           percpu area for all cpus, so make use of it */
        if (have_vcpu_info_placement) {
-               printk(KERN_INFO "Xen: using vcpu_info placement\n");
-
                pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
                pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
                pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
@@ -927,22 +1073,22 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
        return ret;
 }
 
-static const struct pv_info xen_info __initdata = {
+static const struct pv_info xen_info __initconst = {
        .paravirt_enabled = 1,
        .shared_kernel_pmd = 0,
 
+#ifdef CONFIG_X86_64
+       .extra_user_64bit_cs = FLAT_USER_CS64,
+#endif
+
        .name = "Xen",
 };
 
-static const struct pv_init_ops xen_init_ops __initdata = {
+static const struct pv_init_ops xen_init_ops __initconst = {
        .patch = xen_patch,
 };
 
-static const struct pv_time_ops xen_time_ops __initdata = {
-       .sched_clock = xen_clocksource_read,
-};
-
-static const struct pv_cpu_ops xen_cpu_ops __initdata = {
+static const struct pv_cpu_ops xen_cpu_ops __initconst = {
        .cpuid = xen_cpuid,
 
        .set_debugreg = xen_set_debugreg,
@@ -1002,7 +1148,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
        .end_context_switch = xen_end_context_switch,
 };
 
-static const struct pv_apic_ops xen_apic_ops __initdata = {
+static const struct pv_apic_ops xen_apic_ops __initconst = {
 #ifdef CONFIG_X86_LOCAL_APIC
        .startup_ipi_hook = paravirt_nop,
 #endif
@@ -1012,10 +1158,6 @@ static void xen_reboot(int reason)
 {
        struct sched_shutdown r = { .reason = reason };
 
-#ifdef CONFIG_SMP
-       smp_send_stop();
-#endif
-
        if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
                BUG();
 }
@@ -1035,6 +1177,13 @@ static void xen_machine_halt(void)
        xen_reboot(SHUTDOWN_poweroff);
 }
 
+static void xen_machine_power_off(void)
+{
+       if (pm_power_off)
+               pm_power_off();
+       xen_reboot(SHUTDOWN_poweroff);
+}
+
 static void xen_crash_shutdown(struct pt_regs *regs)
 {
        xen_reboot(SHUTDOWN_crash);
@@ -1043,10 +1192,7 @@ static void xen_crash_shutdown(struct pt_regs *regs)
 static int
 xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
 {
-       struct sched_shutdown r = { .reason = SHUTDOWN_crash};
-
-       if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
-               BUG();
+       xen_reboot(SHUTDOWN_crash);
        return NOTIFY_DONE;
 }
 
@@ -1060,10 +1206,10 @@ int xen_panic_handler_init(void)
        return 0;
 }
 
-static const struct machine_ops __initdata xen_machine_ops = {
+static const struct machine_ops xen_machine_ops __initconst = {
        .restart = xen_restart,
        .halt = xen_machine_halt,
-       .power_off = xen_machine_halt,
+       .power_off = xen_machine_power_off,
        .shutdown = xen_machine_halt,
        .crash_shutdown = xen_crash_shutdown,
        .emergency_restart = xen_emergency_restart,
@@ -1089,6 +1235,8 @@ static void __init xen_setup_stackprotector(void)
 /* First C function to be called on Xen boot */
 asmlinkage void __init xen_start_kernel(void)
 {
+       struct physdev_set_iopl set_iopl;
+       int rc;
        pgd_t *pgd;
 
        if (!xen_start_info)
@@ -1096,10 +1244,11 @@ asmlinkage void __init xen_start_kernel(void)
 
        xen_domain_type = XEN_PV_DOMAIN;
 
+       xen_setup_machphys_mapping();
+
        /* Install Xen paravirt ops */
        pv_info = xen_info;
        pv_init_ops = xen_init_ops;
-       pv_time_ops = xen_time_ops;
        pv_cpu_ops = xen_cpu_ops;
        pv_apic_ops = xen_apic_ops;
 
@@ -1107,13 +1256,7 @@ asmlinkage void __init xen_start_kernel(void)
        x86_init.oem.arch_setup = xen_arch_setup;
        x86_init.oem.banner = xen_banner;
 
-       x86_init.timers.timer_init = xen_time_init;
-       x86_init.timers.setup_percpu_clockev = x86_init_noop;
-       x86_cpuinit.setup_percpu_clockev = x86_init_noop;
-
-       x86_platform.calibrate_tsc = xen_tsc_khz;
-       x86_platform.get_wallclock = xen_get_wallclock;
-       x86_platform.set_wallclock = xen_set_wallclock;
+       xen_init_time_ops();
 
        /*
         * Set up some pagetable state before starting to set any ptes.
@@ -1123,7 +1266,9 @@ asmlinkage void __init xen_start_kernel(void)
 
        /* Prevent unwanted bits from being set in PTEs. */
        __supported_pte_mask &= ~_PAGE_GLOBAL;
+#if 0
        if (!xen_initial_domain())
+#endif
                __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
 
        __supported_pte_mask |= _PAGE_IOMAP;
@@ -1175,6 +1320,15 @@ asmlinkage void __init xen_start_kernel(void)
 
        xen_smp_init();
 
+#ifdef CONFIG_ACPI_NUMA
+       /*
+        * The pages we get from Xen are not related to machine pages, so
+        * any NUMA information the kernel tries to get from ACPI will
+        * be meaningless.  Prevent it from trying.
+        */
+       acpi_numa = -1;
+#endif
+
        pgd = (pgd_t *)xen_start_info->pt_base;
 
        /* Don't do the full vcpu_info placement stuff until we have a
@@ -1182,12 +1336,14 @@ asmlinkage void __init xen_start_kernel(void)
        per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
 
        local_irq_disable();
-       early_boot_irqs_off();
+       early_boot_irqs_disabled = true;
 
        xen_raw_console_write("mapping kernel into physical memory\n");
        pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
+       xen_ident_map_ISA();
 
-       init_mm.pgd = pgd;
+       /* Allocate and initialize top and mid mfn levels for p2m structure */
+       xen_build_mfn_list_list();
 
        /* keep using Xen gdt for now; no urgent need to change it */
 
@@ -1198,10 +1354,18 @@ asmlinkage void __init xen_start_kernel(void)
 #else
        pv_info.kernel_rpl = 0;
 #endif
-
        /* set the limit of our address space */
        xen_reserve_top();
 
+       /* We used to do this in xen_arch_setup, but that is too late on AMD
+        * where early_cpu_init (run before ->arch_setup()) calls early_amd_init
+        * which pokes 0xcf8 port.
+        */
+       set_iopl.iopl = 1;
+       rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
+       if (rc != 0)
+               xen_raw_printk("physdev_op failed %d\n", rc);
+
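
The port poke the comment refers to is PCI configuration mechanism #1: a config address is written to port 0xcf8 and the data is transferred through 0xcfc, which faults in a PV guest until the iopl hypercall above succeeds. A sketch of that access pattern (generic x86, not code from this patch):

    /* Sketch: the 0xcf8/0xcfc config read that early_amd_init ends
     * up performing, and the reason IOPL must be raised this early. */
    static u32 pci_conf1_read(u32 bus, u32 dev, u32 fn, u32 reg)
    {
            outl(0x80000000 | (bus << 16) | (dev << 11) | (fn << 8) |
                 (reg & 0xfc), 0xcf8);     /* select bus/dev/fn/register */
            return inl(0xcfc);             /* read through the data window */
    }
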
 #ifdef CONFIG_X86_32
        /* set up basic CPUID stuff */
        cpu_detect(&new_cpu_data);
@@ -1221,12 +1385,24 @@ asmlinkage void __init xen_start_kernel(void)
                add_preferred_console("xenboot", 0, NULL);
                add_preferred_console("tty", 0, NULL);
                add_preferred_console("hvc", 0, NULL);
+               if (pci_xen)
+                       x86_init.pci.arch_init = pci_xen_init;
        } else {
+               const struct dom0_vga_console_info *info =
+                       (void *)((char *)xen_start_info +
+                                xen_start_info->console.dom0.info_off);
+
+               xen_init_vga(info, xen_start_info->console.dom0.info_size);
+               xen_start_info->console.domU.mfn = 0;
+               xen_start_info->console.domU.evtchn = 0;
+
                /* Make sure ACS will be enabled */
                pci_request_acs();
        }
-               
-
+#ifdef CONFIG_PCI
+       /* PCI BIOS service won't work from a PV guest. */
+       pci_probe &= ~PCI_PROBE_BIOS;
+#endif
        xen_raw_console_write("about to get started...\n");
 
        xen_setup_runstate_info(0);
@@ -1238,3 +1414,133 @@ asmlinkage void __init xen_start_kernel(void)
        x86_64_start_reservations((char *)__pa_symbol(&boot_params));
 #endif
 }
+
+static int init_hvm_pv_info(int *major, int *minor)
+{
+       uint32_t eax, ebx, ecx, edx, pages, msr, base;
+       u64 pfn;
+
+       base = xen_cpuid_base();
+       cpuid(base + 1, &eax, &ebx, &ecx, &edx);
+
+       *major = eax >> 16;
+       *minor = eax & 0xffff;
+       printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor);
+
+       cpuid(base + 2, &pages, &msr, &ecx, &edx);
+
+       pfn = __pa(hypercall_page);
+       wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+
+       xen_setup_features();
+
+       pv_info.name = "Xen HVM";
+
+       xen_domain_type = XEN_HVM_DOMAIN;
+
+       return 0;
+}
+
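
init_hvm_pv_info() leans on xen_cpuid_base() (defined elsewhere in the tree) to find the hypervisor leaves. For context, the conventional detection scans the 0x40000000 CPUID range for the "XenVMMXenVMM" signature; a hedged sketch of that scan, with the function name invented here:

    /* Sketch: locate Xen's CPUID leaf base. A hypervisor exposes a
     * 12-byte signature in ebx/ecx/edx at 0x40000000 + N*0x100;
     * Xen's is "XenVMMXenVMM", with at least two leaves above it. */
    static uint32_t find_xen_cpuid_base(void)
    {
            uint32_t base, eax, signature[3];

            for (base = 0x40000000; base < 0x40010000; base += 0x100) {
                    cpuid(base, &eax, &signature[0],
                          &signature[1], &signature[2]);
                    if (!memcmp("XenVMMXenVMM", signature, 12) &&
                        (eax - base) >= 2)
                            return base;
            }
            return 0;
    }
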
+void __ref xen_hvm_init_shared_info(void)
+{
+       int cpu;
+       struct xen_add_to_physmap xatp;
+       static struct shared_info *shared_info_page;
+
+       if (!shared_info_page)
+               shared_info_page = (struct shared_info *)
+                       extend_brk(PAGE_SIZE, PAGE_SIZE);
+       xatp.domid = DOMID_SELF;
+       xatp.idx = 0;
+       xatp.space = XENMAPSPACE_shared_info;
+       xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
+       if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
+               BUG();
+
+       HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
+
+       /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
+        * page. We use it in the event channel upcall and in some pvclock-
+        * related functions. We don't need the vcpu_info placement
+        * optimizations because we don't use any pv_mmu or pv_irq op on
+        * HVM.
+        * When xen_hvm_init_shared_info is run at boot time, only vcpu 0 is
+        * online; but it is also run at resume time, when multiple vcpus
+        * might be online. */
+       for_each_online_cpu(cpu) {
+               per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
+       }
+}
+
+#ifdef CONFIG_XEN_PVHVM
+static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
+                                   unsigned long action, void *hcpu)
+{
+       int cpu = (long)hcpu;
+       switch (action) {
+       case CPU_UP_PREPARE:
+               xen_vcpu_setup(cpu);
+               if (xen_have_vector_callback)
+                       xen_init_lock_cpu(cpu);
+               break;
+       default:
+               break;
+       }
+       return NOTIFY_OK;
+}
+
+static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = {
+       .notifier_call  = xen_hvm_cpu_notify,
+};
+
+static void __init xen_hvm_guest_init(void)
+{
+       int r;
+       int major, minor;
+
+       r = init_hvm_pv_info(&major, &minor);
+       if (r < 0)
+               return;
+
+       xen_hvm_init_shared_info();
+
+       if (xen_feature(XENFEAT_hvm_callback_vector))
+               xen_have_vector_callback = 1;
+       xen_hvm_smp_init();
+       register_cpu_notifier(&xen_hvm_cpu_notifier);
+       xen_unplug_emulated_devices();
+       x86_init.irqs.intr_init = xen_init_IRQ;
+       xen_hvm_init_time_ops();
+       xen_hvm_init_mmu_ops();
+}
+
+static bool __init xen_hvm_platform(void)
+{
+       if (xen_pv_domain())
+               return false;
+
+       if (!xen_cpuid_base())
+               return false;
+
+       return true;
+}
+
+bool xen_hvm_need_lapic(void)
+{
+       if (xen_pv_domain())
+               return false;
+       if (!xen_hvm_domain())
+               return false;
+       if (xen_feature(XENFEAT_hvm_pirqs) && xen_have_vector_callback)
+               return false;
+       return true;
+}
+EXPORT_SYMBOL_GPL(xen_hvm_need_lapic);
+
+const struct hypervisor_x86 x86_hyper_xen_hvm __refconst = {
+       .name                   = "Xen HVM",
+       .detect                 = xen_hvm_platform,
+       .init_platform          = xen_hvm_guest_init,
+};
+EXPORT_SYMBOL(x86_hyper_xen_hvm);
+#endif