#include <asm/mtrr.h>
#include <asm/mce.h>
#include <asm/i387.h>
+#include <asm/fpu-internal.h> /* Ugh! */
#include <asm/xcr.h>
#include <asm/pvclock.h>
#include <asm/div64.h>
struct kvm_x86_ops *kvm_x86_ops;
EXPORT_SYMBOL_GPL(kvm_x86_ops);
-int ignore_msrs = 0;
-module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
+static bool ignore_msrs = 0;
+module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
bool kvm_has_tsc_control;
EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
u32 kvm_max_guest_tsc_khz;
EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
+/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
+static u32 tsc_tolerance_ppm = 250;
+module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
+
#define KVM_NR_SHARED_MSRS 16
struct kvm_shared_msrs_global {
}
EXPORT_SYMBOL_GPL(kvm_get_dr);
+bool kvm_rdpmc(struct kvm_vcpu *vcpu)
+{
+ u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ u64 data;
+ int err;
+
+ err = kvm_pmu_read_pmc(vcpu, ecx, &data);
+ if (err)
+ return err;
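+ /* RDPMC returns the 64-bit counter split across EDX:EAX */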
+ kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32);
+ return err;
+}
+EXPORT_SYMBOL_GPL(kvm_rdpmc);
+
/*
* List of msr numbers which we expose to userspace through KVM_GET_MSRS
* and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
unsigned long max_tsc_khz;
-static inline int kvm_tsc_changes_freq(void)
+static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
{
- int cpu = get_cpu();
- int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
- cpufreq_quick_get(cpu) != 0;
- put_cpu();
- return ret;
+ return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
+ vcpu->arch.virtual_tsc_shift);
}
-u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu)
+static u32 adjust_tsc_khz(u32 khz, s32 ppm)
{
- if (vcpu->arch.virtual_tsc_khz)
- return vcpu->arch.virtual_tsc_khz;
- else
- return __this_cpu_read(cpu_tsc_khz);
+ u64 v = (u64)khz * (1000000 + ppm);
+ do_div(v, 1000000);
+ return v;
}
-static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
+static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
{
- u64 ret;
+ u32 thresh_lo, thresh_hi;
+ int use_scaling = 0;
- WARN_ON(preemptible());
- if (kvm_tsc_changes_freq())
- printk_once(KERN_WARNING
- "kvm: unreliable cycle conversion on adjustable rate TSC\n");
- ret = nsec * vcpu_tsc_khz(vcpu);
- do_div(ret, USEC_PER_SEC);
- return ret;
-}
-
-static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
-{
/* Compute a scale to convert nanoseconds in TSC cycles */
kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
- &vcpu->arch.tsc_catchup_shift,
- &vcpu->arch.tsc_catchup_mult);
+ &vcpu->arch.virtual_tsc_shift,
+ &vcpu->arch.virtual_tsc_mult);
+ vcpu->arch.virtual_tsc_khz = this_tsc_khz;
+
+ /*
+ * Compute the variation in TSC rate which is acceptable
+ * within the range of tolerance and decide whether the
+ * rate being applied is within those bounds of the hardware
+ * rate. If so, no scaling or compensation need be done.
+ */
+ thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
+ thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
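+ /*
+ * Worked example with illustrative numbers: for a host tsc_khz of
+ * 1000000 and the default tsc_tolerance_ppm of 250, thresh_lo is
+ * 999750 kHz and thresh_hi is 1000250 kHz, so only requested rates
+ * outside that window force scaling or catchup below.
+ */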
+ if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) {
+ pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
+ use_scaling = 1;
+ }
+ kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
}
static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
{
- u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec,
- vcpu->arch.tsc_catchup_mult,
- vcpu->arch.tsc_catchup_shift);
- tsc += vcpu->arch.last_tsc_write;
+ u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
+ vcpu->arch.virtual_tsc_mult,
+ vcpu->arch.virtual_tsc_shift);
+ tsc += vcpu->arch.this_tsc_write;
return tsc;
}
struct kvm *kvm = vcpu->kvm;
u64 offset, ns, elapsed;
unsigned long flags;
- s64 sdiff;
+ s64 usdiff;
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
ns = get_kernel_ns();
elapsed = ns - kvm->arch.last_tsc_nsec;
- sdiff = data - kvm->arch.last_tsc_write;
- if (sdiff < 0)
- sdiff = -sdiff;
+
+ /* n.b - signed multiplication and division required */
+ usdiff = data - kvm->arch.last_tsc_write;
+#ifdef CONFIG_X86_64
+ usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
+#else
+ /* do_div() only does unsigned */
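+ /*
+ * idivl divides the 64-bit dividend in edx:eax by the third
+ * operand; the xor then clears edx so only the quotient is
+ * read back into usdiff.
+ */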
+ asm("idivl %2; xor %%edx, %%edx"
+ : "=A"(usdiff)
+ : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz));
+#endif
+ do_div(elapsed, 1000);
+ usdiff -= elapsed;
+ if (usdiff < 0)
+ usdiff = -usdiff;
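+ /*
+ * Illustrative example: with virtual_tsc_khz = 2000000, a write
+ * 3000000 cycles past the last one is 1500us of virtual time; if
+ * 1400000ns of real time elapsed, usdiff ends up as |1500 - 1400|
+ * = 100us, well inside the one-second window checked below.
+ */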
/*
- * Special case: close write to TSC within 5 seconds of
- * another CPU is interpreted as an attempt to synchronize
- * The 5 seconds is to accommodate host load / swapping as
- * well as any reset of TSC during the boot process.
- *
- * In that case, for a reliable TSC, we can match TSC offsets,
- * or make a best guest using elapsed value.
- */
- if (sdiff < nsec_to_cycles(vcpu, 5ULL * NSEC_PER_SEC) &&
- elapsed < 5ULL * NSEC_PER_SEC) {
+ * Special case: a TSC write whose virtual cycle time differs from
+ * real elapsed time by less than one second is interpreted as an
+ * attempt to synchronize the CPU.
+ *
+ * For a reliable TSC, we can match TSC offsets, and for an unstable
+ * TSC, we add elapsed time in this computation. We could let the
+ * compensation code attempt to catch up if we fall behind, but
+ * it's better to try to match offsets from the beginning.
+ */
+ if (usdiff < USEC_PER_SEC &&
+ vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
if (!check_tsc_unstable()) {
- offset = kvm->arch.last_tsc_offset;
+ offset = kvm->arch.cur_tsc_offset;
pr_debug("kvm: matched tsc offset for %llu\n", data);
} else {
u64 delta = nsec_to_cycles(vcpu, elapsed);
- offset += delta;
+ data += delta;
+ offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
}
- ns = kvm->arch.last_tsc_nsec;
+ } else {
+ /*
+ * We split periods of matched TSC writes into generations.
+ * For each generation, we track the original measured
+ * nanosecond time, offset, and write, so if TSCs are in
+ * sync, we can match exact offset, and if not, we can match
+ * exact software computation in compute_guest_tsc()
+ *
+ * These values are tracked in kvm->arch.cur_xxx variables.
+ */
+ kvm->arch.cur_tsc_generation++;
+ kvm->arch.cur_tsc_nsec = ns;
+ kvm->arch.cur_tsc_write = data;
+ kvm->arch.cur_tsc_offset = offset;
+ pr_debug("kvm: new tsc generation %u, clock %llu\n",
+ kvm->arch.cur_tsc_generation, data);
}
+
+ /*
+ * We also track the most recent recorded KHZ, write and time to
+ * allow the matching interval to be extended at each write.
+ */
kvm->arch.last_tsc_nsec = ns;
kvm->arch.last_tsc_write = data;
- kvm->arch.last_tsc_offset = offset;
- kvm_x86_ops->write_tsc_offset(vcpu, offset);
- raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+ kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
/* Reset of TSC must disable overshoot protection below */
vcpu->arch.hv_clock.tsc_timestamp = 0;
- vcpu->arch.last_tsc_write = data;
- vcpu->arch.last_tsc_nsec = ns;
+ vcpu->arch.last_guest_tsc = data;
+
+ /* Keep track of which generation this VCPU has synchronized to */
+ vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
+ vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
+ vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
+
+ kvm_x86_ops->write_tsc_offset(vcpu, offset);
+ raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
}
+
EXPORT_SYMBOL_GPL(kvm_write_tsc);
static int kvm_guest_time_update(struct kvm_vcpu *v)
local_irq_save(flags);
tsc_timestamp = kvm_x86_ops->read_l1_tsc(v);
kernel_ns = get_kernel_ns();
- this_tsc_khz = vcpu_tsc_khz(v);
+ this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
if (unlikely(this_tsc_khz == 0)) {
local_irq_restore(flags);
kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
if (vcpu->tsc_catchup) {
u64 tsc = compute_guest_tsc(v, kernel_ns);
if (tsc > tsc_timestamp) {
- kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp);
+ adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
tsc_timestamp = tsc;
}
}
* observed by the guest and ensure the new system time is greater.
*/
max_kernel_ns = 0;
- if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) {
+ if (vcpu->hv_clock.tsc_timestamp) {
max_kernel_ns = vcpu->last_guest_tsc -
vcpu->hv_clock.tsc_timestamp;
max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
*/
vcpu->hv_clock.version += 2;
- shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
+ shared_kaddr = kmap_atomic(vcpu->time_page);
memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
sizeof(vcpu->hv_clock));
- kunmap_atomic(shared_kaddr, KM_USER0);
+ kunmap_atomic(shared_kaddr);
mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
return 0;
if (page_num >= blob_size)
goto out;
r = -ENOMEM;
- page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- if (!page)
+ page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
+ if (IS_ERR(page)) {
+ r = PTR_ERR(page);
goto out;
- r = -EFAULT;
- if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE))
- goto out_free;
+ }
if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
goto out_free;
r = 0;
int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
+ bool pr = false;
+
switch (msr) {
case MSR_EFER:
return set_efer(vcpu, data);
case MSR_K7_HWCR:
data &= ~(u64)0x40; /* ignore flush filter disable */
data &= ~(u64)0x100; /* ignore ignne emulation enable */
+ data &= ~(u64)0x8; /* ignore TLB cache disable */
if (data != 0) {
pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
data);
case MSR_VM_HSAVE_PA:
case MSR_AMD64_PATCH_LOADER:
break;
+ case MSR_NHM_SNB_PKG_CST_CFG_CTL: /* 0xe2 */
case 0x200 ... 0x2ff:
return set_msr_mtrr(vcpu, msr, data);
case MSR_IA32_APICBASE:
* which we perfectly emulate ;-). Any other value should be at least
* reported, some guests depend on them.
*/
- case MSR_P6_EVNTSEL0:
- case MSR_P6_EVNTSEL1:
case MSR_K7_EVNTSEL0:
case MSR_K7_EVNTSEL1:
case MSR_K7_EVNTSEL2:
/* at least RHEL 4 unconditionally writes to the perfctr registers,
* so we ignore writes to make it happy.
*/
- case MSR_P6_PERFCTR0:
- case MSR_P6_PERFCTR1:
case MSR_K7_PERFCTR0:
case MSR_K7_PERFCTR1:
case MSR_K7_PERFCTR2:
pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
"0x%x data 0x%llx\n", msr, data);
break;
+ case MSR_P6_PERFCTR0:
+ case MSR_P6_PERFCTR1:
+ pr = true;
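+ /* fall through */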
+ case MSR_P6_EVNTSEL0:
+ case MSR_P6_EVNTSEL1:
+ if (kvm_pmu_msr(vcpu, msr))
+ return kvm_pmu_set_msr(vcpu, msr, data);
+
+ if (pr || data != 0)
+ pr_unimpl(vcpu, "disabled perfctr wrmsr: "
+ "0x%x data 0x%llx\n", msr, data);
+ break;
case MSR_K7_CLK_CTL:
/*
* Ignore all writes to this no longer documented MSR.
*/
pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
break;
+ case MSR_AMD64_OSVW_ID_LENGTH:
+ if (!guest_cpuid_has_osvw(vcpu))
+ return 1;
+ vcpu->arch.osvw.length = data;
+ break;
+ case MSR_AMD64_OSVW_STATUS:
+ if (!guest_cpuid_has_osvw(vcpu))
+ return 1;
+ vcpu->arch.osvw.status = data;
+ break;
default:
if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
return xen_hvm_config(vcpu, data);
+ if (kvm_pmu_msr(vcpu, msr))
+ return kvm_pmu_set_msr(vcpu, msr, data);
if (!ignore_msrs) {
pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
msr, data);
case MSR_K8_SYSCFG:
case MSR_K7_HWCR:
case MSR_VM_HSAVE_PA:
- case MSR_P6_PERFCTR0:
- case MSR_P6_PERFCTR1:
- case MSR_P6_EVNTSEL0:
- case MSR_P6_EVNTSEL1:
case MSR_K7_EVNTSEL0:
case MSR_K7_PERFCTR0:
case MSR_K8_INT_PENDING_MSG:
case MSR_AMD64_NB_CFG:
case MSR_FAM10H_MMIO_CONF_BASE:
+ case MSR_NHM_SNB_PKG_CST_CFG_CTL: /* 0xe2 */
+ data = 0;
+ break;
+ case MSR_P6_PERFCTR0:
+ case MSR_P6_PERFCTR1:
+ case MSR_P6_EVNTSEL0:
+ case MSR_P6_EVNTSEL1:
+ if (kvm_pmu_msr(vcpu, msr))
+ return kvm_pmu_get_msr(vcpu, msr, pdata);
data = 0;
break;
case MSR_IA32_UCODE_REV:
*/
data = 0xbe702111;
break;
+ case MSR_AMD64_OSVW_ID_LENGTH:
+ if (!guest_cpuid_has_osvw(vcpu))
+ return 1;
+ data = vcpu->arch.osvw.length;
+ break;
+ case MSR_AMD64_OSVW_STATUS:
+ if (!guest_cpuid_has_osvw(vcpu))
+ return 1;
+ data = vcpu->arch.osvw.status;
+ break;
default:
+ if (kvm_pmu_msr(vcpu, msr))
+ return kvm_pmu_get_msr(vcpu, msr, pdata);
if (!ignore_msrs) {
pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
return 1;
if (msrs.nmsrs >= MAX_IO_MSRS)
goto out;
- r = -ENOMEM;
size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
- entries = kmalloc(size, GFP_KERNEL);
- if (!entries)
+ entries = memdup_user(user_msrs->entries, size);
+ if (IS_ERR(entries)) {
+ r = PTR_ERR(entries);
goto out;
-
- r = -EFAULT;
- if (copy_from_user(entries, user_msrs->entries, size))
- goto out_free;
+ }
r = n = __msr_io(vcpu, &msrs, entries, do_msr);
if (r < 0)
case KVM_CAP_XSAVE:
case KVM_CAP_ASYNC_PF:
case KVM_CAP_GET_TSC_KHZ:
+ case KVM_CAP_PCI_2_3:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
}
kvm_x86_ops->vcpu_load(vcpu, cpu);
- if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
- /* Make sure TSC doesn't go backwards */
- s64 tsc_delta;
- u64 tsc;
- tsc = kvm_x86_ops->read_l1_tsc(vcpu);
- tsc_delta = !vcpu->arch.last_guest_tsc ? 0 :
- tsc - vcpu->arch.last_guest_tsc;
+ /* Apply any externally detected TSC adjustments (due to suspend) */
+ if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
+ adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
+ vcpu->arch.tsc_offset_adjustment = 0;
+ set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+ }
+ if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
+ s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
+ native_read_tsc() - vcpu->arch.last_host_tsc;
if (tsc_delta < 0)
mark_tsc_unstable("KVM discovered backwards TSC");
if (check_tsc_unstable()) {
- kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
+ u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu,
+ vcpu->arch.last_guest_tsc);
+ kvm_x86_ops->write_tsc_offset(vcpu, offset);
vcpu->arch.tsc_catchup = 1;
}
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
{
kvm_x86_ops->vcpu_put(vcpu);
kvm_put_guest_fpu(vcpu);
- vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
+ vcpu->arch.last_host_tsc = native_read_tsc();
}
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
r = -EINVAL;
if (!vcpu->arch.apic)
goto out;
- u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
- r = -ENOMEM;
- if (!u.lapic)
- goto out;
- r = -EFAULT;
- if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state)))
+ u.lapic = memdup_user(argp, sizeof(*u.lapic));
+ if (IS_ERR(u.lapic)) {
+ r = PTR_ERR(u.lapic);
goto out;
+ }
+
r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
if (r)
goto out;
break;
}
case KVM_SET_XSAVE: {
- u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
- r = -ENOMEM;
- if (!u.xsave)
- break;
-
- r = -EFAULT;
- if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave)))
- break;
+ u.xsave = memdup_user(argp, sizeof(*u.xsave));
+ if (IS_ERR(u.xsave)) {
+ r = PTR_ERR(u.xsave);
+ goto out;
+ }
r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
break;
break;
}
case KVM_SET_XCRS: {
- u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
- r = -ENOMEM;
- if (!u.xcrs)
- break;
-
- r = -EFAULT;
- if (copy_from_user(u.xcrs, argp,
- sizeof(struct kvm_xcrs)))
- break;
+ u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
+ if (IS_ERR(u.xcrs)) {
+ r = PTR_ERR(u.xcrs);
+ goto out;
+ }
r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
break;
u32 user_tsc_khz;
r = -EINVAL;
- if (!kvm_has_tsc_control)
- break;
-
user_tsc_khz = (u32)arg;
if (user_tsc_khz >= kvm_max_guest_tsc_khz)
goto out;
- kvm_x86_ops->set_tsc_khz(vcpu, user_tsc_khz);
+ if (user_tsc_khz == 0)
+ user_tsc_khz = tsc_khz;
+
+ kvm_set_tsc_khz(vcpu, user_tsc_khz);
r = 0;
goto out;
}
case KVM_GET_TSC_KHZ: {
- r = -EIO;
- if (check_tsc_unstable())
- goto out;
-
- r = vcpu_tsc_khz(vcpu);
-
+ r = vcpu->arch.virtual_tsc_khz;
goto out;
}
default:
return r;
}
+int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
+{
+ return VM_FAULT_SIGBUS;
+}
+
static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
{
int ret;
unsigned long *dirty_bitmap,
unsigned long nr_dirty_pages)
{
+ spin_lock(&kvm->mmu_lock);
+
/* Not many dirty pages compared to # of shadow pages. */
if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) {
unsigned long gfn_offset;
for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) {
unsigned long gfn = memslot->base_gfn + gfn_offset;
- spin_lock(&kvm->mmu_lock);
kvm_mmu_rmap_write_protect(kvm, gfn, memslot);
- spin_unlock(&kvm->mmu_lock);
}
kvm_flush_remote_tlbs(kvm);
- } else {
- spin_lock(&kvm->mmu_lock);
+ } else
kvm_mmu_slot_remove_write_access(kvm, memslot->id);
- spin_unlock(&kvm->mmu_lock);
- }
+
+ spin_unlock(&kvm->mmu_lock);
}
/*
memset(dirty_bitmap_head, 0, n);
r = -ENOMEM;
- slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+ slots = kmemdup(kvm->memslots, sizeof(*kvm->memslots), GFP_KERNEL);
if (!slots)
goto out;
- memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
+
memslot = id_to_memslot(slots, log->slot);
memslot->nr_dirty_pages = 0;
memslot->dirty_bitmap = dirty_bitmap_head;
r = -EEXIST;
if (kvm->arch.vpic)
goto create_irqchip_unlock;
+ r = -EINVAL;
+ if (atomic_read(&kvm->online_vcpus))
+ goto create_irqchip_unlock;
r = -ENOMEM;
vpic = kvm_create_pic(kvm);
if (vpic) {
}
case KVM_GET_IRQCHIP: {
/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
- struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
+ struct kvm_irqchip *chip;
- r = -ENOMEM;
- if (!chip)
+ chip = memdup_user(argp, sizeof(*chip));
+ if (IS_ERR(chip)) {
+ r = PTR_ERR(chip);
goto out;
- r = -EFAULT;
- if (copy_from_user(chip, argp, sizeof *chip))
- goto get_irqchip_out;
+ }
+
r = -ENXIO;
if (!irqchip_in_kernel(kvm))
goto get_irqchip_out;
}
case KVM_SET_IRQCHIP: {
/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
- struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
+ struct kvm_irqchip *chip;
- r = -ENOMEM;
- if (!chip)
+ chip = memdup_user(argp, sizeof(*chip));
+ if (IS_ERR(chip)) {
+ r = PTR_ERR(chip);
goto out;
- r = -EFAULT;
- if (copy_from_user(chip, argp, sizeof *chip))
- goto set_irqchip_out;
+ }
+
r = -ENXIO;
if (!irqchip_in_kernel(kvm))
goto set_irqchip_out;
goto emul_write;
}
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
kaddr += offset_in_page(gpa);
switch (bytes) {
case 1:
default:
BUG();
}
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
kvm_release_page_dirty(page);
if (!exchanged)
return res;
}
+static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val)
+{
+ kvm_set_rflags(emul_to_vcpu(ctxt), val);
+}
+
static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
{
return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
}
+static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
+ u32 pmc, u64 *pdata)
+{
+ return kvm_pmu_read_pmc(emul_to_vcpu(ctxt), pmc, pdata);
+}
+
static void emulator_halt(struct x86_emulate_ctxt *ctxt)
{
emul_to_vcpu(ctxt)->arch.halt_request = 1;
return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
}
+static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
+ u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
+{
+ struct kvm_cpuid_entry2 *cpuid = NULL;
+
+ if (eax && ecx)
+ cpuid = kvm_find_cpuid_entry(emul_to_vcpu(ctxt),
+ *eax, *ecx);
+
+ if (cpuid) {
+ *eax = cpuid->eax;
+ *ecx = cpuid->ecx;
+ if (ebx)
+ *ebx = cpuid->ebx;
+ if (edx)
+ *edx = cpuid->edx;
+ return true;
+ }
+
+ return false;
+}
+
static struct x86_emulate_ops emulate_ops = {
.read_std = kvm_read_guest_virt_system,
.write_std = kvm_write_guest_virt_system,
.set_idt = emulator_set_idt,
.get_cr = emulator_get_cr,
.set_cr = emulator_set_cr,
+ .set_rflags = emulator_set_rflags,
.cpl = emulator_get_cpl,
.get_dr = emulator_get_dr,
.set_dr = emulator_set_dr,
.set_msr = emulator_set_msr,
.get_msr = emulator_get_msr,
+ .read_pmc = emulator_read_pmc,
.halt = emulator_halt,
.wbinvd = emulator_wbinvd,
.fix_hypercall = emulator_fix_hypercall,
.get_fpu = emulator_get_fpu,
.put_fpu = emulator_put_fpu,
.intercept = emulator_intercept,
+ .get_cpuid = emulator_get_cpuid,
};
static void cache_all_regs(struct kvm_vcpu *vcpu)
static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
-static int kvm_is_in_guest(void)
+int kvm_is_in_guest(void)
{
- return percpu_read(current_vcpu) != NULL;
+ return __this_cpu_read(current_vcpu) != NULL;
}
static int kvm_is_user_mode(void)
{
int user_mode = 3;
- if (percpu_read(current_vcpu))
- user_mode = kvm_x86_ops->get_cpl(percpu_read(current_vcpu));
+ if (__this_cpu_read(current_vcpu))
+ user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu));
return user_mode != 0;
}
{
unsigned long ip = 0;
- if (percpu_read(current_vcpu))
- ip = kvm_rip_read(percpu_read(current_vcpu));
+ if (__this_cpu_read(current_vcpu))
+ ip = kvm_rip_read(__this_cpu_read(current_vcpu));
return ip;
}
void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
{
- percpu_write(current_vcpu, vcpu);
+ __this_cpu_write(current_vcpu, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_before_handle_nmi);
void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
{
- percpu_write(current_vcpu, NULL);
+ __this_cpu_write(current_vcpu, NULL);
}
EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
process_nmi(vcpu);
req_immediate_exit =
kvm_check_request(KVM_REQ_IMMEDIATE_EXIT, vcpu);
+ if (kvm_check_request(KVM_REQ_PMU, vcpu))
+ kvm_handle_pmu_event(vcpu);
+ if (kvm_check_request(KVM_REQ_PMI, vcpu))
+ kvm_deliver_pmi(vcpu);
}
r = kvm_mmu_reload(vcpu);
profile_hit(KVM_PROFILING, (void *)rip);
}
+ if (unlikely(vcpu->arch.tsc_always_catchup))
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
kvm_lapic_sync_from_vapic(vcpu);
return 0;
}
-int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
- bool has_error_code, u32 error_code)
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
+ int reason, bool has_error_code, u32 error_code)
{
struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
int ret;
init_emulate_ctxt(vcpu);
- ret = emulator_task_switch(ctxt, tss_selector, reason,
+ ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
has_error_code, error_code);
if (ret)
kvm_async_pf_hash_reset(vcpu);
vcpu->arch.apf.halted = false;
+ kvm_pmu_reset(vcpu);
+
return kvm_x86_ops->vcpu_reset(vcpu);
}
struct kvm *kvm;
struct kvm_vcpu *vcpu;
int i;
+ int ret;
+ u64 local_tsc;
+ u64 max_tsc = 0;
+ bool stable, backwards_tsc = false;
kvm_shared_msr_cpu_online();
- list_for_each_entry(kvm, &vm_list, vm_list)
- kvm_for_each_vcpu(i, vcpu, kvm)
- if (vcpu->cpu == smp_processor_id())
- kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
- return kvm_x86_ops->hardware_enable(garbage);
+ ret = kvm_x86_ops->hardware_enable(garbage);
+ if (ret != 0)
+ return ret;
+
+ local_tsc = native_read_tsc();
+ stable = !check_tsc_unstable();
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!stable && vcpu->cpu == smp_processor_id())
+ set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+ if (stable && vcpu->arch.last_host_tsc > local_tsc) {
+ backwards_tsc = true;
+ if (vcpu->arch.last_host_tsc > max_tsc)
+ max_tsc = vcpu->arch.last_host_tsc;
+ }
+ }
+ }
+
+ /*
+ * Sometimes, even reliable TSCs go backwards. This happens on
+ * platforms that reset TSC during suspend or hibernate actions, but
+ * maintain synchronization. We must compensate. Fortunately, we can
+ * detect that condition here, which happens early in CPU bringup,
+ * before any KVM threads can be running. Unfortunately, we can't
+ * bring the TSCs fully up to date with real time, as we aren't yet far
+ * enough into CPU bringup that we know how much real time has actually
+ * elapsed; our helper function, get_kernel_ns() will be using boot
+ * variables that haven't been updated yet.
+ *
+ * So we simply find the maximum observed TSC above, then record the
+ * adjustment to TSC in each VCPU. When the VCPU later gets loaded,
+ * the adjustment will be applied. Note that we accumulate
+ * adjustments, in case multiple suspend cycles happen before some VCPU
+ * gets a chance to run again. In the event that no KVM threads get a
+ * chance to run, we will miss the entire elapsed period, as we'll have
+ * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may
+ * lose cycle time. This isn't too big a deal, since the loss will be
+ * uniform across all VCPUs (not to mention the scenario is extremely
+ * unlikely). It is possible that a second hibernate recovery happens
+ * much faster than a first, causing the observed TSC here to be
+ * smaller; this would require additional padding adjustment, which is
+ * why we set last_host_tsc to the local tsc observed here.
+ *
+ * N.B. - this code below runs only on platforms with reliable TSC,
+ * as that is the only way backwards_tsc is set above. Also note
+ * that this runs for ALL vcpus, which is not a bug; all VCPUs should
+ * have the same delta_cyc adjustment applied if backwards_tsc
+ * is detected. Note further, this adjustment is only done once,
+ * as we reset last_host_tsc on all VCPUs to stop this from being
+ * called multiple times (one for each physical CPU bringup).
+ *
+ * Platforms with unreliable TSCs don't have to deal with this; they
+ * will be compensated by the logic in vcpu_load, which sets the TSC to
+ * catchup mode. This will catchup all VCPUs to real time, but cannot
+ * guarantee that they stay in perfect synchronization.
+ */
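+ /*
+ * Illustrative example: if the largest last_host_tsc recorded
+ * before suspend was 10,000,000,000 and the local TSC now reads
+ * 4,000,000,000, every VCPU accumulates delta_cyc = 6,000,000,000
+ * into tsc_offset_adjustment, which vcpu_load applies later.
+ */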
+ if (backwards_tsc) {
+ u64 delta_cyc = max_tsc - local_tsc;
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ vcpu->arch.tsc_offset_adjustment += delta_cyc;
+ vcpu->arch.last_host_tsc = local_tsc;
+ }
+
+ /*
+ * We have to disable TSC offset matching; if you were
+ * booting a VM while issuing an S4 host suspend, you
+ * may have some problem. Solving this issue is
+ * left as an exercise to the reader.
+ */
+ kvm->arch.last_tsc_nsec = 0;
+ kvm->arch.last_tsc_write = 0;
+ }
+
+ }
+ return 0;
}
void kvm_arch_hardware_disable(void *garbage)
kvm_x86_ops->check_processor_compatibility(rtn);
}
+bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
+{
+ return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
+}
+
int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
{
struct page *page;
}
vcpu->arch.pio_data = page_address(page);
- kvm_init_tsc_catchup(vcpu, max_tsc_khz);
+ kvm_set_tsc_khz(vcpu, max_tsc_khz);
r = kvm_mmu_create(vcpu);
if (r < 0)
goto fail_free_mce_banks;
kvm_async_pf_hash_reset(vcpu);
+ kvm_pmu_init(vcpu);
return 0;
fail_free_mce_banks:
{
int idx;
+ kvm_pmu_destroy(vcpu);
kfree(vcpu->arch.mce_banks);
kvm_free_lapic(vcpu);
idx = srcu_read_lock(&vcpu->kvm->srcu);
free_page((unsigned long)vcpu->arch.pio_data);
}
-int kvm_arch_init_vm(struct kvm *kvm)
+int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
+ if (type)
+ return -EINVAL;
+
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
put_page(kvm->arch.ept_identity_pagetable);
}
+void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+ struct kvm_memory_slot *dont)
+{
+ int i;
+
+ for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+ if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) {
+ vfree(free->arch.lpage_info[i]);
+ free->arch.lpage_info[i] = NULL;
+ }
+ }
+}
+
+int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
+{
+ int i;
+
+ for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+ unsigned long ugfn;
+ int lpages;
+ int level = i + 2;
+
+ lpages = gfn_to_index(slot->base_gfn + npages - 1,
+ slot->base_gfn, level) + 1;
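+ /*
+ * Illustrative example: a slot with base_gfn 0x1000 and npages
+ * 0x800 spans gfns 0x1000-0x17ff; at level 2 (512 gfns per large
+ * page, assuming the usual 9-bit shift) that covers large-page
+ * frames 0x8-0xb, so lpages = 4.
+ */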
+
+ slot->arch.lpage_info[i] =
+ vzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
+ if (!slot->arch.lpage_info[i])
+ goto out_free;
+
+ if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
+ slot->arch.lpage_info[i][0].write_count = 1;
+ if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
+ slot->arch.lpage_info[i][lpages - 1].write_count = 1;
+ ugfn = slot->userspace_addr >> PAGE_SHIFT;
+ /*
+ * If the gfn and userspace address are not aligned wrt each
+ * other, or if explicitly asked to, disable large page
+ * support for this slot
+ */
+ if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
+ !kvm_largepages_enabled()) {
+ unsigned long j;
+
+ for (j = 0; j < lpages; ++j)
+ slot->arch.lpage_info[i][j].write_count = 1;
+ }
+ }
+
+ return 0;
+
+out_free:
+ for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+ vfree(slot->arch.lpage_info[i]);
+ slot->arch.lpage_info[i] = NULL;
+ }
+ return -ENOMEM;
+}
+
int kvm_arch_prepare_memory_region(struct kvm *kvm,
struct kvm_memory_slot *memslot,
struct kvm_memory_slot old,
if (npages && !old.rmap) {
unsigned long userspace_addr;
- down_write(¤t->mm->mmap_sem);
- userspace_addr = do_mmap(NULL, 0,
+ userspace_addr = vm_mmap(NULL, 0,
npages * PAGE_SIZE,
PROT_READ | PROT_WRITE,
map_flags,
0);
- up_write(¤t->mm->mmap_sem);
if (IS_ERR((void *)userspace_addr))
return PTR_ERR((void *)userspace_addr);
if (!user_alloc && !old.user_alloc && old.rmap && !npages) {
int ret;
- down_write(¤t->mm->mmap_sem);
- ret = do_munmap(current->mm, old.userspace_addr,
+ ret = vm_munmap(old.userspace_addr,
old.npages * PAGE_SIZE);
- up_write(¤t->mm->mmap_sem);
if (ret < 0)
printk(KERN_WARNING
"kvm_vm_ioctl_set_memory_region: "
kvm_inject_page_fault(vcpu, &fault);
}
vcpu->arch.apf.halted = false;
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
}
bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)