- Update to 3.4-rc7.

[linux-flexiantxendom0-3.2.10.git] / arch / x86 / xen / enlighten.c
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c

index 02c710b..6fbd402 100644 (file)
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -62,6 +62,16 @@
  #include <asm/reboot.h>
  #include <asm/stackprotector.h>
  #include <asm/hypervisor.h>
+#include <asm/mwait.h>
+#include <asm/pci_x86.h>
+
+#ifdef CONFIG_ACPI
+#include <linux/acpi.h>
+#include <asm/acpi.h>
+#include <acpi/pdc_intel.h>
+#include <acpi/processor.h>
+#include <xen/interface/platform.h>
+#endif
  
  #include "xen-ops.h"
  #include "mmu.h"
@@ -77,8 +87,8 @@ EXPORT_SYMBOL_GPL(xen_domain_type);
  
  unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
  EXPORT_SYMBOL(machine_to_phys_mapping);
-unsigned int   machine_to_phys_order;
-EXPORT_SYMBOL(machine_to_phys_order);
+unsigned long  machine_to_phys_nr;
+EXPORT_SYMBOL(machine_to_phys_nr);
  
  struct start_info *xen_start_info;
  EXPORT_SYMBOL_GPL(xen_start_info);
@@ -115,8 +125,8 @@ static int have_vcpu_info_placement = 1;
  static void clamp_max_cpus(void)
  {
  #ifdef CONFIG_SMP
-       if (setup_max_cpus > MAX_VIRT_CPUS)
-               setup_max_cpus = MAX_VIRT_CPUS;
+       if (setup_max_cpus > XEN_LEGACY_MAX_VCPUS)
+               setup_max_cpus = XEN_LEGACY_MAX_VCPUS;
  #endif
  }
  
@@ -128,11 +138,11 @@ static void xen_vcpu_setup(int cpu)
  
         BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
  
-       if (cpu < MAX_VIRT_CPUS)
+       if (cpu < XEN_LEGACY_MAX_VCPUS)
                 per_cpu(xen_vcpu,cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
  
         if (!have_vcpu_info_placement) {
-               if (cpu >= MAX_VIRT_CPUS)
+               if (cpu >= XEN_LEGACY_MAX_VCPUS)
                         clamp_max_cpus();
                 return;
         }
@@ -200,13 +210,17 @@ static void __init xen_banner(void)
  static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0;
  static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0;
  
+static __read_mostly unsigned int cpuid_leaf1_ecx_set_mask;
+static __read_mostly unsigned int cpuid_leaf5_ecx_val;
+static __read_mostly unsigned int cpuid_leaf5_edx_val;
+
  static void xen_cpuid(unsigned int *ax, unsigned int *bx,
                       unsigned int *cx, unsigned int *dx)
  {
         unsigned maskebx = ~0;
         unsigned maskecx = ~0;
         unsigned maskedx = ~0;
-
+       unsigned setecx = 0;
         /*
          * Mask out inconvenient features, to try and disable as many
          * unsupported kernel subsystems as possible.
@@ -214,9 +228,18 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
         switch (*ax) {
         case 1:
                 maskecx = cpuid_leaf1_ecx_mask;
+               setecx = cpuid_leaf1_ecx_set_mask;
                 maskedx = cpuid_leaf1_edx_mask;
                 break;
  
+       case CPUID_MWAIT_LEAF:
+               /* Synthesize the values. */
+               *ax = 0;
+               *bx = 0;
+               *cx = cpuid_leaf5_ecx_val;
+               *dx = cpuid_leaf5_edx_val;
+               return;
+
         case 0xb:
                 /* Suppress extended topology stuff */
                 maskebx = 0;
@@ -232,12 +255,80 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
  
         *bx &= maskebx;
         *cx &= maskecx;
+       *cx |= setecx;
         *dx &= maskedx;
+
  }
  
-static __init void xen_init_cpuid_mask(void)
+static bool __init xen_check_mwait(void)
  {
+#if defined(CONFIG_ACPI) && !defined(CONFIG_ACPI_PROCESSOR_AGGREGATOR) && \
+       !defined(CONFIG_ACPI_PROCESSOR_AGGREGATOR_MODULE)
+       struct xen_platform_op op = {
+               .cmd                    = XENPF_set_processor_pminfo,
+               .u.set_pminfo.id        = -1,
+               .u.set_pminfo.type      = XEN_PM_PDC,
+       };
+       uint32_t buf[3];
         unsigned int ax, bx, cx, dx;
+       unsigned int mwait_mask;
+
+       /* We need to determine whether it is OK to expose the MWAIT
+        * capability to the kernel to harvest deeper than C3 states from ACPI
+        * _CST using the processor_harvest_xen.c module. For this to work, we
+        * need to gather the MWAIT_LEAF values (which the cstate.c code
+        * checks against). The hypervisor won't expose the MWAIT flag because
+        * it would break backwards compatibility; so we will find out directly
+        * from the hardware and hypercall.
+        */
+       if (!xen_initial_domain())
+               return false;
+
+       ax = 1;
+       cx = 0;
+
+       native_cpuid(&ax, &bx, &cx, &dx);
+
+       mwait_mask = (1 << (X86_FEATURE_EST % 32)) |
+                    (1 << (X86_FEATURE_MWAIT % 32));
+
+       if ((cx & mwait_mask) != mwait_mask)
+               return false;
+
+       /* We need to emulate the MWAIT_LEAF and for that we need both
+        * ecx and edx. The hypercall provides only partial information.
+        */
+
+       ax = CPUID_MWAIT_LEAF;
+       bx = 0;
+       cx = 0;
+       dx = 0;
+
+       native_cpuid(&ax, &bx, &cx, &dx);
+
+       /* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so,
+        * don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3.
+        */
+       buf[0] = ACPI_PDC_REVISION_ID;
+       buf[1] = 1;
+       buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP);
+
+       set_xen_guest_handle(op.u.set_pminfo.pdc, buf);
+
+       if ((HYPERVISOR_dom0_op(&op) == 0) &&
+           (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) {
+               cpuid_leaf5_ecx_val = cx;
+               cpuid_leaf5_edx_val = dx;
+       }
+       return true;
+#else
+       return false;
+#endif
+}
+static void __init xen_init_cpuid_mask(void)
+{
+       unsigned int ax, bx, cx, dx;
+       unsigned int xsave_mask;
  
         cpuid_leaf1_edx_mask =
                 ~((1 << X86_FEATURE_MCE)  |  /* disable MCE */
@@ -249,24 +340,19 @@ static __init void xen_init_cpuid_mask(void)
                 cpuid_leaf1_edx_mask &=
                         ~((1 << X86_FEATURE_APIC) |  /* disable local APIC */
                           (1 << X86_FEATURE_ACPI));  /* disable ACPI */
-
         ax = 1;
         cx = 0;
         xen_cpuid(&ax, &bx, &cx, &dx);
  
-       /* cpuid claims we support xsave; try enabling it to see what happens */
-       if (cx & (1 << (X86_FEATURE_XSAVE % 32))) {
-               unsigned long cr4;
+       xsave_mask =
+               (1 << (X86_FEATURE_XSAVE % 32)) |
+               (1 << (X86_FEATURE_OSXSAVE % 32));
  
-               set_in_cr4(X86_CR4_OSXSAVE);
-               
-               cr4 = read_cr4();
-
-               if ((cr4 & X86_CR4_OSXSAVE) == 0)
-                       cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32));
-
-               clear_in_cr4(X86_CR4_OSXSAVE);
-       }
+       /* Xen will set CR4.OSXSAVE if supported and not disabled by force */
+       if ((cx & xsave_mask) != xsave_mask)
+               cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */
+       if (xen_check_mwait())
+               cpuid_leaf1_ecx_set_mask = (1 << (X86_FEATURE_MWAIT % 32));
  }
  
  static void xen_set_debugreg(int reg, unsigned long val)
@@ -348,6 +434,8 @@ static void xen_set_ldt(const void *addr, unsigned entries)
         struct mmuext_op *op;
         struct multicall_space mcs = xen_mc_entry(sizeof(*op));
  
+       trace_xen_cpu_set_ldt(addr, entries);
+
         op = mcs.args;
         op->cmd = MMUEXT_SET_LDT;
         op->arg1.linear_addr = (unsigned long)addr;
@@ -407,7 +495,7 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
  /*
   * load_gdt for early boot, when the gdt is only mapped once
   */
-static __init void xen_load_gdt_boot(const struct desc_ptr *dtr)
+static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
  {
         unsigned long va = dtr->address;
         unsigned int size = dtr->size + 1;
@@ -503,6 +591,8 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
         xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
         u64 entry = *(u64 *)ptr;
  
+       trace_xen_cpu_write_ldt_entry(dt, entrynum, entry);
+
         preempt_disable();
  
         xen_mc_flush();
@@ -572,10 +662,12 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
         unsigned long p = (unsigned long)&dt[entrynum];
         unsigned long start, end;
  
+       trace_xen_cpu_write_idt_entry(dt, entrynum, g);
+
         preempt_disable();
  
-       start = __get_cpu_var(idt_desc).address;
-       end = start + __get_cpu_var(idt_desc).size + 1;
+       start = __this_cpu_read(idt_desc.address);
+       end = start + __this_cpu_read(idt_desc.size) + 1;
  
         xen_mc_flush();
  
@@ -626,6 +718,8 @@ static void xen_load_idt(const struct desc_ptr *desc)
         static DEFINE_SPINLOCK(lock);
         static struct trap_info traps[257];
  
+       trace_xen_cpu_load_idt(desc);
+
         spin_lock(&lock);
  
         __get_cpu_var(idt_desc) = *desc;
@@ -644,6 +738,8 @@ static void xen_load_idt(const struct desc_ptr *desc)
  static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
                                 const void *desc, int type)
  {
+       trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
+
         preempt_disable();
  
         switch (type) {
@@ -669,9 +765,11 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
   * Version of write_gdt_entry for use at early boot-time needed to
   * update an entry as simply as possible.
   */
-static __init void xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
+static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
                                             const void *desc, int type)
  {
+       trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
+
         switch (type) {
         case DESC_LDT:
         case DESC_TSS:
@@ -691,7 +789,9 @@ static __init void xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
  static void xen_load_sp0(struct tss_struct *tss,
                          struct thread_struct *thread)
  {
-       struct multicall_space mcs = xen_mc_entry(0);
+       struct multicall_space mcs;
+
+       mcs = xen_mc_entry(0);
         MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
         xen_mc_issue(PARAVIRT_LAZY_CPU);
  }
@@ -710,9 +810,40 @@ static void xen_io_delay(void)
  }
  
  #ifdef CONFIG_X86_LOCAL_APIC
+static unsigned long xen_set_apic_id(unsigned int x)
+{
+       WARN_ON(1);
+       return x;
+}
+static unsigned int xen_get_apic_id(unsigned long x)
+{
+       return ((x)>>24) & 0xFFu;
+}
  static u32 xen_apic_read(u32 reg)
  {
-       return 0;
+       struct xen_platform_op op = {
+               .cmd = XENPF_get_cpuinfo,
+               .interface_version = XENPF_INTERFACE_VERSION,
+               .u.pcpu_info.xen_cpuid = 0,
+       };
+       int ret = 0;
+
+       /* Shouldn't need this as APIC is turned off for PV, and we only
+        * get called on the bootup processor. But just in case. */
+       if (!xen_initial_domain() || smp_processor_id())
+               return 0;
+
+       if (reg == APIC_LVR)
+               return 0x10;
+
+       if (reg != APIC_ID)
+               return 0;
+
+       ret = HYPERVISOR_dom0_op(&op);
+       if (ret)
+               return 0;
+
+       return op.u.pcpu_info.apic_id << 24;
  }
  
  static void xen_apic_write(u32 reg, u32 val)
@@ -750,6 +881,8 @@ static void set_xen_basic_apic_ops(void)
         apic->icr_write = xen_apic_icr_write;
         apic->wait_icr_idle = xen_apic_wait_icr_idle;
         apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle;
+       apic->set_apic_id = xen_set_apic_id;
+       apic->get_apic_id = xen_get_apic_id;
  }
  
  #endif
@@ -769,11 +902,11 @@ static DEFINE_PER_CPU(unsigned long, xen_cr0_value);
  
  static unsigned long xen_read_cr0(void)
  {
-       unsigned long cr0 = percpu_read(xen_cr0_value);
+       unsigned long cr0 = this_cpu_read(xen_cr0_value);
  
         if (unlikely(cr0 == 0)) {
                 cr0 = native_read_cr0();
-               percpu_write(xen_cr0_value, cr0);
+               this_cpu_write(xen_cr0_value, cr0);
         }
  
         return cr0;
@@ -783,7 +916,7 @@ static void xen_write_cr0(unsigned long cr0)
  {
         struct multicall_space mcs;
  
-       percpu_write(xen_cr0_value, cr0);
+       this_cpu_write(xen_cr0_value, cr0);
  
         /* Only pay attention to cr0.TS; everything else is
            ignored. */
@@ -868,7 +1001,7 @@ void xen_setup_shared_info(void)
         xen_setup_mfn_list_list();
  }
  
-/* This is called once we have the cpu_possible_map */
+/* This is called once we have the cpu_possible_mask */
  void xen_setup_vcpu_info_placement(void)
  {
         int cpu;
@@ -940,18 +1073,22 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
         return ret;
  }
  
-static const struct pv_info xen_info __initdata = {
+static const struct pv_info xen_info __initconst = {
         .paravirt_enabled = 1,
         .shared_kernel_pmd = 0,
  
+#ifdef CONFIG_X86_64
+       .extra_user_64bit_cs = FLAT_USER_CS64,
+#endif
+
         .name = "Xen",
  };
  
-static const struct pv_init_ops xen_init_ops __initdata = {
+static const struct pv_init_ops xen_init_ops __initconst = {
         .patch = xen_patch,
  };
  
-static const struct pv_cpu_ops xen_cpu_ops __initdata = {
+static const struct pv_cpu_ops xen_cpu_ops __initconst = {
         .cpuid = xen_cpuid,
  
         .set_debugreg = xen_set_debugreg,
@@ -1011,7 +1148,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
         .end_context_switch = xen_end_context_switch,
  };
  
-static const struct pv_apic_ops xen_apic_ops __initdata = {
+static const struct pv_apic_ops xen_apic_ops __initconst = {
  #ifdef CONFIG_X86_LOCAL_APIC
         .startup_ipi_hook = paravirt_nop,
  #endif
@@ -1021,10 +1158,6 @@ static void xen_reboot(int reason)
  {
         struct sched_shutdown r = { .reason = reason };
  
-#ifdef CONFIG_SMP
-       stop_other_cpus();
-#endif
-
         if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
                 BUG();
  }
@@ -1044,6 +1177,13 @@ static void xen_machine_halt(void)
         xen_reboot(SHUTDOWN_poweroff);
  }
  
+static void xen_machine_power_off(void)
+{
+       if (pm_power_off)
+               pm_power_off();
+       xen_reboot(SHUTDOWN_poweroff);
+}
+
  static void xen_crash_shutdown(struct pt_regs *regs)
  {
         xen_reboot(SHUTDOWN_crash);
@@ -1066,10 +1206,10 @@ int xen_panic_handler_init(void)
         return 0;
  }
  
-static const struct machine_ops __initdata xen_machine_ops = {
+static const struct machine_ops xen_machine_ops __initconst = {
         .restart = xen_restart,
         .halt = xen_machine_halt,
-       .power_off = xen_machine_halt,
+       .power_off = xen_machine_power_off,
         .shutdown = xen_machine_halt,
         .crash_shutdown = xen_crash_shutdown,
         .emergency_restart = xen_emergency_restart,
@@ -1126,7 +1266,9 @@ asmlinkage void __init xen_start_kernel(void)
  
         /* Prevent unwanted bits from being set in PTEs. */
         __supported_pte_mask &= ~_PAGE_GLOBAL;
+#if 0
         if (!xen_initial_domain())
+#endif
                 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
  
         __supported_pte_mask |= _PAGE_IOMAP;
@@ -1178,20 +1320,23 @@ asmlinkage void __init xen_start_kernel(void)
  
         xen_smp_init();
  
-       pgd = (pgd_t *)xen_start_info->pt_base;
+#ifdef CONFIG_ACPI_NUMA
+       /*
+        * The pages we get from Xen are not related to machine pages, so
+        * any NUMA information the kernel tries to get from ACPI will
+        * be meaningless.  Prevent it from trying.
+        */
+       acpi_numa = -1;
+#endif
  
-       if (!xen_initial_domain())
-               __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
+       pgd = (pgd_t *)xen_start_info->pt_base;
  
-       __supported_pte_mask |= _PAGE_IOMAP;
         /* Don't do the full vcpu_info placement stuff until we have a
            possible map and a non-dummy shared_info. */
         per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
  
         local_irq_disable();
-       early_boot_irqs_off();
-
-       memblock_init();
+       early_boot_irqs_disabled = true;
  
         xen_raw_console_write("mapping kernel into physical memory\n");
         pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
@@ -1243,11 +1388,21 @@ asmlinkage void __init xen_start_kernel(void)
                 if (pci_xen)
                         x86_init.pci.arch_init = pci_xen_init;
         } else {
+               const struct dom0_vga_console_info *info =
+                       (void *)((char *)xen_start_info +
+                                xen_start_info->console.dom0.info_off);
+
+               xen_init_vga(info, xen_start_info->console.dom0.info_size);
+               xen_start_info->console.domU.mfn = 0;
+               xen_start_info->console.domU.evtchn = 0;
+
                 /* Make sure ACS will be enabled */
                 pci_request_acs();
         }
-               
-
+#ifdef CONFIG_PCI
+       /* PCI BIOS service won't work from a PV guest. */
+       pci_probe &= ~PCI_PROBE_BIOS;
+#endif
         xen_raw_console_write("about to get started...\n");
  
         xen_setup_runstate_info(0);
@@ -1260,25 +1415,6 @@ asmlinkage void __init xen_start_kernel(void)
  #endif
  }
  
-static uint32_t xen_cpuid_base(void)
-{
-       uint32_t base, eax, ebx, ecx, edx;
-       char signature[13];
-
-       for (base = 0x40000000; base < 0x40010000; base += 0x100) {
-               cpuid(base, &eax, &ebx, &ecx, &edx);
-               *(uint32_t *)(signature + 0) = ebx;
-               *(uint32_t *)(signature + 4) = ecx;
-               *(uint32_t *)(signature + 8) = edx;
-               signature[12] = 0;
-
-               if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2))
-                       return base;
-       }
-
-       return 0;
-}
-
  static int init_hvm_pv_info(int *major, int *minor)
  {
         uint32_t eax, ebx, ecx, edx, pages, msr, base;
@@ -1298,15 +1434,14 @@ static int init_hvm_pv_info(int *major, int *minor)
  
         xen_setup_features();
  
-       pv_info = xen_info;
-       pv_info.kernel_rpl = 0;
+       pv_info.name = "Xen HVM";
  
         xen_domain_type = XEN_HVM_DOMAIN;
  
         return 0;
  }
  
-void xen_hvm_init_shared_info(void)
+void __ref xen_hvm_init_shared_info(void)
  {
         int cpu;
         struct xen_add_to_physmap xatp;
@@ -1344,7 +1479,9 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
         int cpu = (long)hcpu;
         switch (action) {
         case CPU_UP_PREPARE:
-               per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
+               xen_vcpu_setup(cpu);
+               if (xen_have_vector_callback)
+                       xen_init_lock_cpu(cpu);
                 break;
         default:
                 break;
@@ -1352,7 +1489,7 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
         return NOTIFY_OK;
  }
  
-static struct notifier_block __cpuinitdata xen_hvm_cpu_notifier = {
+static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = {
         .notifier_call  = xen_hvm_cpu_notify,
  };
  
@@ -1369,9 +1506,9 @@ static void __init xen_hvm_guest_init(void)
  
         if (xen_feature(XENFEAT_hvm_callback_vector))
                 xen_have_vector_callback = 1;
+       xen_hvm_smp_init();
         register_cpu_notifier(&xen_hvm_cpu_notifier);
         xen_unplug_emulated_devices();
-       have_vcpu_info_placement = 0;
         x86_init.irqs.intr_init = xen_init_IRQ;
         xen_hvm_init_time_ops();
         xen_hvm_init_mmu_ops();
@@ -1388,7 +1525,19 @@ static bool __init xen_hvm_platform(void)
         return true;
  }
  
-const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = {
+bool xen_hvm_need_lapic(void)
+{
+       if (xen_pv_domain())
+               return false;
+       if (!xen_hvm_domain())
+               return false;
+       if (xen_feature(XENFEAT_hvm_pirqs) && xen_have_vector_callback)
+               return false;
+       return true;
+}
+EXPORT_SYMBOL_GPL(xen_hvm_need_lapic);
+
+const struct hypervisor_x86 x86_hyper_xen_hvm __refconst = {
         .name                   = "Xen HVM",
         .detect                 = xen_hvm_platform,
         .init_platform          = xen_hvm_guest_init,