- Update to 3.4-rc7.

[linux-flexiantxendom0-3.2.10.git] / arch / x86 / kvm / x86.c
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c

index 4e9bd23..7e0f8e1 100644 (file)
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -57,6 +57,7 @@
  #include <asm/mtrr.h>
  #include <asm/mce.h>
  #include <asm/i387.h>
+#include <asm/fpu-internal.h> /* Ugh! */
  #include <asm/xcr.h>
  #include <asm/pvclock.h>
  #include <asm/div64.h>
@@ -1013,10 +1014,10 @@ static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
  
  static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
  {
-       u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec,
+       u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
                                       vcpu->arch.virtual_tsc_mult,
                                       vcpu->arch.virtual_tsc_shift);
-       tsc += vcpu->arch.last_tsc_write;
+       tsc += vcpu->arch.this_tsc_write;
         return tsc;
  }
  
@@ -1025,7 +1026,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
         struct kvm *kvm = vcpu->kvm;
         u64 offset, ns, elapsed;
         unsigned long flags;
-       s64 nsdiff;
+       s64 usdiff;
  
         raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
         offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
@@ -1033,18 +1034,19 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
         elapsed = ns - kvm->arch.last_tsc_nsec;
  
         /* n.b - signed multiplication and division required */
-       nsdiff = data - kvm->arch.last_tsc_write;
+       usdiff = data - kvm->arch.last_tsc_write;
  #ifdef CONFIG_X86_64
-       nsdiff = (nsdiff * 1000) / vcpu->arch.virtual_tsc_khz;
+       usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
  #else
         /* do_div() only does unsigned */
         asm("idivl %2; xor %%edx, %%edx"
-           : "=A"(nsdiff)
-           : "A"(nsdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz));
+           : "=A"(usdiff)
+           : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz));
  #endif
-       nsdiff -= elapsed;
-       if (nsdiff < 0)
-               nsdiff = -nsdiff;
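+       do_div(elapsed, 1000); /* NOTE(review): elapsed is now in usec, yet nsec_to_cycles(vcpu, elapsed) below still takes it as nsec? confirm */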
+       usdiff -= elapsed;
+       if (usdiff < 0)
+               usdiff = -usdiff;
  
         /*
          * Special case: TSC write with a small delta (1 second) of virtual
@@ -1056,10 +1058,10 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
          * compensation code attempt to catch up if we fall behind, but
          * it's better to try to match offsets from the beginning.
           */
-       if (nsdiff < NSEC_PER_SEC &&
+       if (usdiff < USEC_PER_SEC &&
             vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
                 if (!check_tsc_unstable()) {
-                       offset = kvm->arch.last_tsc_offset;
+                       offset = kvm->arch.cur_tsc_offset;
                         pr_debug("kvm: matched tsc offset for %llu\n", data);
                 } else {
                         u64 delta = nsec_to_cycles(vcpu, elapsed);
@@ -1067,20 +1069,45 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
                         offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
                         pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
                 }
+       } else {
+               /*
+                * We split periods of matched TSC writes into generations.
+                * For each generation, we track the original measured
+                * nanosecond time, offset, and write, so if TSCs are in
+                * sync, we can match exact offset, and if not, we can match
+                * exact software computation in compute_guest_tsc().
+                *
+                * These values are tracked in kvm->arch.cur_xxx variables.
+                */
+               kvm->arch.cur_tsc_generation++;
+               kvm->arch.cur_tsc_nsec = ns;
+               kvm->arch.cur_tsc_write = data;
+               kvm->arch.cur_tsc_offset = offset;
+               pr_debug("kvm: new tsc generation %u, clock %llu\n",
+                        kvm->arch.cur_tsc_generation, data);
         }
+
+       /*
+        * We also track the most recent recorded KHZ, write and time to
+        * allow the matching interval to be extended at each write.
+        */
         kvm->arch.last_tsc_nsec = ns;
         kvm->arch.last_tsc_write = data;
-       kvm->arch.last_tsc_offset = offset;
         kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
-       kvm_x86_ops->write_tsc_offset(vcpu, offset);
-       raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
  
         /* Reset of TSC must disable overshoot protection below */
         vcpu->arch.hv_clock.tsc_timestamp = 0;
-       vcpu->arch.last_tsc_write = data;
-       vcpu->arch.last_tsc_nsec = ns;
         vcpu->arch.last_guest_tsc = data;
+
+       /* Keep track of which generation this VCPU has synchronized to */
+       vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
+       vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
+       vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
+
+       kvm_x86_ops->write_tsc_offset(vcpu, offset);
+       raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
  }
+
  EXPORT_SYMBOL_GPL(kvm_write_tsc);
  
  static int kvm_guest_time_update(struct kvm_vcpu *v)
@@ -1181,12 +1208,12 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
          */
         vcpu->hv_clock.version += 2;
  
-       shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
+       shared_kaddr = kmap_atomic(vcpu->time_page);
  
         memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
                sizeof(vcpu->hv_clock));
  
-       kunmap_atomic(shared_kaddr, KM_USER0);
+       kunmap_atomic(shared_kaddr);
  
         mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
         return 0;
@@ -1522,6 +1549,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
         case MSR_K7_HWCR:
                 data &= ~(u64)0x40;     /* ignore flush filter disable */
                 data &= ~(u64)0x100;    /* ignore ignne emulation enable */
+               data &= ~(u64)0x8;      /* ignore TLB cache disable */
                 if (data != 0) {
                         pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
                                 data);
@@ -1554,6 +1582,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
         case MSR_VM_HSAVE_PA:
         case MSR_AMD64_PATCH_LOADER:
                 break;
+       case MSR_NHM_SNB_PKG_CST_CFG_CTL: /* 0xe2 - NOTE(review): falls through to set_msr_mtrr() below; confirm intended */
         case 0x200 ... 0x2ff:
                 return set_msr_mtrr(vcpu, msr, data);
         case MSR_IA32_APICBASE:
@@ -1876,6 +1905,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
         case MSR_K8_INT_PENDING_MSG:
         case MSR_AMD64_NB_CFG:
         case MSR_FAM10H_MMIO_CONF_BASE:
+       case MSR_NHM_SNB_PKG_CST_CFG_CTL: /* 0xe2 */
                 data = 0;
                 break;
         case MSR_P6_PERFCTR0:
@@ -2118,6 +2148,7 @@ int kvm_dev_ioctl_check_extension(long ext)
         case KVM_CAP_XSAVE:
         case KVM_CAP_ASYNC_PF:
         case KVM_CAP_GET_TSC_KHZ:
+       case KVM_CAP_PCI_2_3:
                 r = 1;
                 break;
         case KVM_CAP_COALESCED_MMIO:
@@ -3040,6 +3071,8 @@ static void write_protect_slot(struct kvm *kvm,
                                unsigned long *dirty_bitmap,
                                unsigned long nr_dirty_pages)
  {
+       spin_lock(&kvm->mmu_lock);
+
         /* Not many dirty pages compared to # of shadow pages. */
         if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) {
                 unsigned long gfn_offset;
@@ -3047,16 +3080,13 @@ static void write_protect_slot(struct kvm *kvm,
                 for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) {
                         unsigned long gfn = memslot->base_gfn + gfn_offset;
  
-                       spin_lock(&kvm->mmu_lock);
                         kvm_mmu_rmap_write_protect(kvm, gfn, memslot);
-                       spin_unlock(&kvm->mmu_lock);
                 }
                 kvm_flush_remote_tlbs(kvm);
-       } else {
-               spin_lock(&kvm->mmu_lock);
+       } else
                 kvm_mmu_slot_remove_write_access(kvm, memslot->id);
-               spin_unlock(&kvm->mmu_lock);
-       }
+
+       spin_unlock(&kvm->mmu_lock);
  }
  
  /*
@@ -3175,6 +3205,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
                 r = -EEXIST;
                 if (kvm->arch.vpic)
                         goto create_irqchip_unlock;
+               r = -EINVAL;
+               if (atomic_read(&kvm->online_vcpus))
+                       goto create_irqchip_unlock;
                 r = -ENOMEM;
                 vpic = kvm_create_pic(kvm);
                 if (vpic) {
@@ -3891,7 +3924,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
                 goto emul_write;
         }
  
-       kaddr = kmap_atomic(page, KM_USER0);
+       kaddr = kmap_atomic(page);
         kaddr += offset_in_page(gpa);
         switch (bytes) {
         case 1:
@@ -3909,7 +3942,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
         default:
                 BUG();
         }
-       kunmap_atomic(kaddr, KM_USER0);
+       kunmap_atomic(kaddr);
         kvm_release_page_dirty(page);
  
         if (!exchanged)
@@ -4105,6 +4138,11 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
         return res;
  }
  
+static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val)
+{
+       kvm_set_rflags(emul_to_vcpu(ctxt), val);
+}
+
  static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
  {
         return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
@@ -4286,6 +4324,7 @@ static struct x86_emulate_ops emulate_ops = {
         .set_idt             = emulator_set_idt,
         .get_cr              = emulator_get_cr,
         .set_cr              = emulator_set_cr,
+       .set_rflags          = emulator_set_rflags,
         .cpl                 = emulator_get_cpl,
         .get_dr              = emulator_get_dr,
         .set_dr              = emulator_set_dr,
@@ -5631,15 +5670,15 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
         return 0;
  }
  
-int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
-                   bool has_error_code, u32 error_code)
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
+                   int reason, bool has_error_code, u32 error_code)
  {
         struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
         int ret;
  
         init_emulate_ctxt(vcpu);
  
-       ret = emulator_task_switch(ctxt, tss_selector, reason,
+       ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
                                    has_error_code, error_code);
  
         if (ret)
@@ -6077,6 +6116,11 @@ void kvm_arch_check_processor_compat(void *rtn)
         kvm_x86_ops->check_processor_compatibility(rtn);
  }
  
+bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
+{
+       return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
+}
+
  int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
  {
         struct page *page;
@@ -6215,6 +6259,65 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
                 put_page(kvm->arch.ept_identity_pagetable);
  }
  
+void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+                          struct kvm_memory_slot *dont)
+{
+       int i;
+
+       for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+               if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) {
+                       vfree(free->arch.lpage_info[i]);
+                       free->arch.lpage_info[i] = NULL;
+               }
+       }
+}
+
+int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
+{
+       int i;
+
+       for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+               unsigned long ugfn;
+               int lpages;
+               int level = i + 2;
+
+               lpages = gfn_to_index(slot->base_gfn + npages - 1,
+                                     slot->base_gfn, level) + 1;
+
+               slot->arch.lpage_info[i] =
+                       vzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
+               if (!slot->arch.lpage_info[i])
+                       goto out_free;
+
+               if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
+                       slot->arch.lpage_info[i][0].write_count = 1;
+               if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
+                       slot->arch.lpage_info[i][lpages - 1].write_count = 1;
+               ugfn = slot->userspace_addr >> PAGE_SHIFT;
+               /*
+                * If the gfn and userspace address are not aligned wrt each
+                * other, or if explicitly asked to, disable large page
+                * support for this slot
+                */
+               if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
+                   !kvm_largepages_enabled()) {
+                       unsigned long j;
+
+                       for (j = 0; j < lpages; ++j)
+                               slot->arch.lpage_info[i][j].write_count = 1;
+               }
+       }
+
+       return 0;
+
+out_free:
+       for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+               vfree(slot->arch.lpage_info[i]);
+               slot->arch.lpage_info[i] = NULL;
+       }
+       return -ENOMEM;
+}
+
  int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                 struct kvm_memory_slot *memslot,
                                 struct kvm_memory_slot old,
@@ -6235,13 +6338,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
                 if (npages && !old.rmap) {
                         unsigned long userspace_addr;
  
-                       down_write(&current->mm->mmap_sem);
-                       userspace_addr = do_mmap(NULL, 0,
+                       userspace_addr = vm_mmap(NULL, 0,
                                                  npages * PAGE_SIZE,
                                                  PROT_READ | PROT_WRITE,
                                                  map_flags,
                                                  0);
-                       up_write(&current->mm->mmap_sem);
  
                         if (IS_ERR((void *)userspace_addr))
                                 return PTR_ERR((void *)userspace_addr);
@@ -6265,10 +6366,8 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
         if (!user_alloc && !old.user_alloc && old.rmap && !npages) {
                 int ret;
  
-               down_write(&current->mm->mmap_sem);
-               ret = do_munmap(current->mm, old.userspace_addr,
+               ret = vm_munmap(old.userspace_addr,
                                 old.npages * PAGE_SIZE);
-               up_write(&current->mm->mmap_sem);
                 if (ret < 0)
                         printk(KERN_WARNING
                                "kvm_vm_ioctl_set_memory_region: "
@@ -6484,6 +6583,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
                 kvm_inject_page_fault(vcpu, &fault);
         }
         vcpu->arch.apf.halted = false;
+       vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
  }
  
  bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)