- Update Xen patches to 2.6.33 and c/s 1003.
author    Jan Beulich <jbeulich@novell.com>
Tue, 2 Mar 2010 16:27:17 +0000 (17:27 +0100)
committer Jan Beulich <jbeulich@novell.com>
Tue, 2 Mar 2010 16:27:17 +0000 (17:27 +0100)
- patches.xen/xen-clockevents: replace Xen's custom time handling
  with an implementation using the GENERIC_CLOCKEVENTS infrastructure.
- Update Xen config files.

suse-commit: 6232503539ac67fe2d2113af17af69a6c28c0800

17 files changed:
arch/x86/Kconfig
arch/x86/include/mach-xen/asm/hypervisor.h
arch/x86/kernel/time-xen.c
drivers/gpu/drm/vmwgfx/Kconfig
drivers/xen/Kconfig
drivers/xen/blkfront/blkfront.c
drivers/xen/blktap2/device.c
drivers/xen/blktap2/sysfs.c
drivers/xen/core/Makefile
drivers/xen/core/clockevents.c [new file with mode: 0644]
drivers/xen/core/machine_reboot.c
drivers/xen/core/smpboot.c
drivers/xen/core/spinlock.c
include/xen/clock.h [new file with mode: 0644]
include/xen/evtchn.h
kernel/hrtimer.c
kernel/timer.c

index d61e425..812121a 100644 (file)
@@ -78,7 +78,6 @@ config CLOCKSOURCE_WATCHDOG
 
 config GENERIC_CLOCKEVENTS
        def_bool y
-       depends on !XEN
 
 config GENERIC_CLOCKEVENTS_BROADCAST
        def_bool y
index fb728c6..edd1b82 100644 (file)
@@ -86,8 +86,6 @@ extern start_info_t *xen_start_info;
 #define init_hypervisor(c) ((void)((c)->x86_hyper_vendor = X86_HYPER_VENDOR_XEN))
 #define init_hypervisor_platform() init_hypervisor(&boot_cpu_data)
 
-struct vcpu_runstate_info *setup_runstate_area(unsigned int cpu);
-
 /* arch/xen/kernel/evtchn.c */
 /* Force a proper event-channel callback from Xen. */
 void force_evtchn_callback(void);
index e628ec6..05c3b97 100644 (file)
@@ -25,7 +25,7 @@
 #include <asm/time.h>
 #include <asm/timer.h>
 
-#include <xen/evtchn.h>
+#include <xen/clock.h>
 #include <xen/sysctl.h>
 #include <xen/interface/vcpu.h>
 
@@ -55,17 +55,7 @@ static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
 static struct timespec shadow_tv;
 static u32 shadow_tv_version;
 
-/* Keep track of last time we did processing/updating of jiffies and xtime. */
-static u64 processed_system_time;   /* System time (ns) at last processing. */
-
-struct local_time_info {
-       u64 processed_system;
-       u64 accounted_system;
-       /* How much CPU time was spent blocked and how much was 'stolen'? */
-       u64 accounted_stolen;
-       u64 accounted_blocked;
-};
-static DEFINE_PER_CPU(struct local_time_info, local_time);
+static u64 jiffies_bias, system_time_bias;
 
 /* Current runstate of each CPU (updated automatically by the hypervisor). */
 DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
@@ -73,10 +63,6 @@ DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
 /* Must be signed, as it's compared with s64 quantities which can be -ve. */
 #define NS_PER_TICK (1000000000LL/HZ)
 
-static struct vcpu_set_periodic_timer xen_set_periodic_tick = {
-       .period_ns = NS_PER_TICK
-};
-
 static void __clock_was_set(struct work_struct *unused)
 {
        clock_was_set();
@@ -127,19 +113,6 @@ static int __init __permitted_clock_jitter(char *str)
 __setup("permitted_clock_jitter=", __permitted_clock_jitter);
 
 /*
- * Limit on the number of CPUs that may concurrently attempt to acquire
- * xtime_lock in timer_interrupt() (reducing contention potentially leading
- * to a live lock on systems with many CPUs.
- */
-static unsigned int __read_mostly duty_limit = -2;
-static int __init set_duty_limit(char *str)
-{
-       duty_limit = simple_strtoul(str, NULL, 0) - 1;
-       return 1;
-}
-__setup("timer_duty_limit=", set_duty_limit);
-
-/*
  * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
  * yielding a 64-bit result.
  */
@@ -222,6 +195,11 @@ static u64 get_nsec_offset(struct shadow_time_info *shadow)
        return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
 }
 
+static inline u64 processed_system_time(void)
+{
+       return (jiffies_64 - jiffies_bias) * NS_PER_TICK + system_time_bias;
+}
+
 static void __update_wallclock(time_t sec, long nsec)
 {
        long wtm_nsec, xtime_nsec;
@@ -229,7 +207,7 @@ static void __update_wallclock(time_t sec, long nsec)
        u64 tmp, wc_nsec;
 
        /* Adjust wall-clock time base. */
-       wc_nsec = processed_system_time;
+       wc_nsec = processed_system_time();
        wc_nsec += sec * (u64)NSEC_PER_SEC;
        wc_nsec += nsec;
 
@@ -261,6 +239,17 @@ static void update_wallclock(void)
                __update_wallclock(shadow_tv.tv_sec, shadow_tv.tv_nsec);
 }
 
+void xen_check_wallclock_update(void)
+{
+       if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
+               write_seqlock(&xtime_lock);
+               update_wallclock();
+               write_sequnlock(&xtime_lock);
+               if (keventd_up())
+                       schedule_work(&clock_was_set_work);
+       }
+}
+
 /*
  * Reads a consistent set of time-base values from Xen, into a shadow data
  * area.
@@ -320,7 +309,7 @@ static void sync_xen_wallclock(unsigned long dummy)
        op.cmd = XENPF_settime;
        op.u.settime.secs        = sec;
        op.u.settime.nsecs       = nsec;
-       op.u.settime.system_time = processed_system_time;
+       op.u.settime.system_time = processed_system_time();
        WARN_ON(HYPERVISOR_platform_op(&op));
 
        update_wallclock();
@@ -331,7 +320,7 @@ static void sync_xen_wallclock(unsigned long dummy)
        mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
 }
 
-static unsigned long long local_clock(void)
+unsigned long long xen_local_clock(void)
 {
        unsigned int cpu = get_cpu();
        struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
@@ -355,7 +344,7 @@ static unsigned long long local_clock(void)
 /*
  * Runstate accounting
  */
-static void get_runstate_snapshot(struct vcpu_runstate_info *res)
+void get_runstate_snapshot(struct vcpu_runstate_info *res)
 {
        u64 state_time;
        struct vcpu_runstate_info *state;
@@ -391,7 +380,7 @@ unsigned long long sched_clock(void)
         */
        preempt_disable();
 
-       now = local_clock();
+       now = xen_local_clock();
 
        get_runstate_snapshot(&runstate);
 
@@ -434,169 +423,6 @@ unsigned long profile_pc(struct pt_regs *regs)
 }
 EXPORT_SYMBOL(profile_pc);
 
-/*
- * Default timer interrupt handler
- */
-static irqreturn_t timer_interrupt(int irq, void *dev_id)
-{
-       static unsigned int contention_count;
-       s64 delta, delta_cpu, stolen, blocked;
-       unsigned int i, cpu = smp_processor_id();
-       struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
-       struct local_time_info *local = &per_cpu(local_time, cpu);
-       bool duty = false;
-       struct vcpu_runstate_info runstate;
-
-       /* Keep nmi watchdog up to date */
-       inc_irq_stat(irq0_irqs);
-
-       /*
-        * Here we are in the timer irq handler. We just have irqs locally
-        * disabled but we don't know if the timer_bh is running on the other
-        * CPU. We need to avoid to SMP race with it. NOTE: we don' t need
-        * the irq version of write_lock because as just said we have irq
-        * locally disabled. -arca
-        */
-       asm (LOCK_PREFIX "xaddl %1, %0"
-            : "+m" (contention_count), "=r" (i) : "1" (1));
-       if (i <= duty_limit) {
-               write_seqlock(&xtime_lock);
-               duty = true;
-       }
-       asm (LOCK_PREFIX "decl %0" : "+m" (contention_count));
-
-       do {
-               get_time_values_from_xen(cpu);
-
-               /* Obtain a consistent snapshot of elapsed wallclock cycles. */
-               delta = delta_cpu =
-                       shadow->system_timestamp + get_nsec_offset(shadow);
-               delta     -= processed_system_time;
-               delta_cpu -= local->processed_system;
-
-               get_runstate_snapshot(&runstate);
-       } while (!time_values_up_to_date());
-
-       if (duty && unlikely(delta < -(s64)permitted_clock_jitter)) {
-               blocked = processed_system_time;
-               write_sequnlock(&xtime_lock);
-               if (printk_ratelimit()) {
-                       printk("Timer ISR/%u: Time went backwards: "
-                              "delta=%Ld/%Ld shadow=%Lx off=%Lx "
-                              "processed=%Lx/%Lx\n",
-                              cpu, delta, delta_cpu, shadow->system_timestamp,
-                              get_nsec_offset(shadow), blocked,
-                              local->processed_system);
-                       for_each_cpu_and(i, cpu_online_mask, cpumask_of(cpu))
-                               printk(" %u: %Lx\n", i,
-                                      per_cpu(local_time.processed_system, i));
-               }
-       } else if (unlikely(delta_cpu < -(s64)permitted_clock_jitter)) {
-               blocked = processed_system_time;
-               if (duty)
-                       write_sequnlock(&xtime_lock);
-               if (printk_ratelimit()) {
-                       printk("Timer ISR/%u: Time went backwards: delta=%Ld"
-                              " shadow=%Lx off=%Lx processed=%Lx/%Lx\n",
-                              cpu, delta_cpu, shadow->system_timestamp,
-                              get_nsec_offset(shadow), blocked,
-                              local->processed_system);
-                       for_each_cpu_and(i, cpu_online_mask, cpumask_of(cpu))
-                               printk(" %u: %Lx\n", i,
-                                      per_cpu(local_time.processed_system, i));
-               }
-       } else if (duty) {
-               /* System-wide jiffy work. */
-               if (delta >= NS_PER_TICK) {
-                       do_div(delta, NS_PER_TICK);
-                       processed_system_time += delta * NS_PER_TICK;
-                       while (delta > HZ) {
-                               clobber_induction_variable(delta);
-                               do_timer(HZ);
-                               delta -= HZ;
-                       }
-                       do_timer(delta);
-               }
-
-               if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
-                       update_wallclock();
-                       if (keventd_up())
-                               schedule_work(&clock_was_set_work);
-               }
-
-               write_sequnlock(&xtime_lock);
-       }
-
-       delta = delta_cpu;
-       delta_cpu += local->processed_system - local->accounted_system;
-       if (delta >= NS_PER_TICK) {
-               do_div(delta, NS_PER_TICK);
-               local->processed_system += delta * NS_PER_TICK;
-       }
-
-       /*
-        * Account stolen ticks.
-        * ensures that the ticks are accounted as stolen.
-        */
-       stolen = runstate.time[RUNSTATE_runnable]
-                + runstate.time[RUNSTATE_offline]
-                - local->accounted_stolen;
-       if ((stolen > 0) && (delta_cpu > 0)) {
-               delta_cpu -= stolen;
-               if (unlikely(delta_cpu < 0))
-                       stolen += delta_cpu; /* clamp local-time progress */
-               do_div(stolen, NS_PER_TICK);
-               local->accounted_stolen += stolen * NS_PER_TICK;
-               local->accounted_system += stolen * NS_PER_TICK;
-               account_steal_time((cputime_t)stolen);
-       }
-
-       /*
-        * Account blocked ticks.
-        * ensures that the ticks are accounted as idle/wait.
-        */
-       blocked = runstate.time[RUNSTATE_blocked]
-                 - local->accounted_blocked;
-       if ((blocked > 0) && (delta_cpu > 0)) {
-               delta_cpu -= blocked;
-               if (unlikely(delta_cpu < 0))
-                       blocked += delta_cpu; /* clamp local-time progress */
-               do_div(blocked, NS_PER_TICK);
-               local->accounted_blocked += blocked * NS_PER_TICK;
-               local->accounted_system  += blocked * NS_PER_TICK;
-               account_idle_time((cputime_t)blocked);
-       }
-
-       /* Account user/system ticks. */
-       if (delta_cpu > 0) {
-               do_div(delta_cpu, NS_PER_TICK);
-               local->accounted_system += delta_cpu * NS_PER_TICK;
-               if (user_mode_vm(get_irq_regs()))
-                       account_user_time(current, (cputime_t)delta_cpu,
-                                         (cputime_t)delta_cpu);
-               else if (current != idle_task(cpu))
-                       account_system_time(current, HARDIRQ_OFFSET,
-                                           (cputime_t)delta_cpu,
-                                           (cputime_t)delta_cpu);
-               else
-                       account_idle_time((cputime_t)delta_cpu);
-       }
-
-       /* Offlined for more than a few seconds? Avoid lockup warnings. */
-       if (stolen > 5*HZ)
-               touch_softlockup_watchdog();
-
-       /* Local timer processing (see update_process_times()). */
-       run_local_timers();
-       rcu_check_callbacks(cpu, user_mode_vm(get_irq_regs()));
-       printk_tick();
-       scheduler_tick();
-       run_posix_cpu_timers(current);
-       profile_tick(CPU_PROFILING);
-
-       return IRQ_HANDLED;
-}
-
 void mark_tsc_unstable(char *reason)
 {
 #ifndef CONFIG_XEN /* XXX Should tell the hypervisor about this fact. */
@@ -605,24 +431,13 @@ void mark_tsc_unstable(char *reason)
 }
 EXPORT_SYMBOL_GPL(mark_tsc_unstable);
 
-static void init_missing_ticks_accounting(unsigned int cpu)
-{
-       struct vcpu_runstate_info *runstate = setup_runstate_area(cpu);
-
-       per_cpu(local_time.accounted_blocked, cpu) =
-               runstate->time[RUNSTATE_blocked];
-       per_cpu(local_time.accounted_stolen, cpu) =
-               runstate->time[RUNSTATE_runnable] +
-               runstate->time[RUNSTATE_offline];
-}
-
 static cycle_t cs_last;
 
 static cycle_t xen_clocksource_read(struct clocksource *cs)
 {
 #ifdef CONFIG_SMP
        cycle_t last = get64(&cs_last);
-       cycle_t ret = local_clock();
+       cycle_t ret = xen_local_clock();
 
        if (unlikely((s64)(ret - last) < 0)) {
                if (last - ret > permitted_clock_jitter
@@ -648,38 +463,28 @@ static cycle_t xen_clocksource_read(struct clocksource *cs)
                last = cur;
        }
 #else
-       return local_clock();
+       return xen_local_clock();
 #endif
 }
 
 /* No locking required. Interrupts are disabled on all CPUs. */
 static void xen_clocksource_resume(void)
 {
+       unsigned long seq;
        unsigned int cpu;
 
        init_cpu_khz();
 
-       for_each_online_cpu(cpu) {
-               switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, cpu,
-                                          &xen_set_periodic_tick)) {
-               case 0:
-#if CONFIG_XEN_COMPAT <= 0x030004
-               case -ENOSYS:
-#endif
-                       break;
-               default:
-                       BUG();
-               }
+       for_each_online_cpu(cpu)
                get_time_values_from_xen(cpu);
-               per_cpu(local_time.accounted_system, cpu) =
-               per_cpu(local_time.processed_system, cpu) =
-                       per_cpu(shadow_time, 0).system_timestamp;
-               init_missing_ticks_accounting(cpu);
-       }
 
-       processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
+       do {
+               seq = read_seqbegin(&xtime_lock);
+               jiffies_bias = jiffies_64;
+       } while (read_seqretry(&xtime_lock, seq));
+       system_time_bias = per_cpu(shadow_time, 0).system_timestamp;
 
-       cs_last = local_clock();
+       cs_last = xen_local_clock();
 }
 
 static struct clocksource clocksource_xen = {
@@ -724,7 +529,7 @@ void xen_read_persistent_clock(struct timespec *ts)
                rmb();
        } while ((s->wc_version & 1) | (version ^ s->wc_version));
 
-       delta = local_clock() + (u64)sec * NSEC_PER_SEC + nsec;
+       delta = xen_local_clock() + (u64)sec * NSEC_PER_SEC + nsec;
        do_div(delta, NSEC_PER_SEC);
 
        ts->tv_sec = delta;
@@ -739,43 +544,17 @@ int xen_update_persistent_clock(void)
        return 0;
 }
 
-/* Dynamically-mapped IRQ. */
-static int __read_mostly timer_irq = -1;
-static struct irqaction timer_action = {
-       .handler = timer_interrupt,
-       .flags   = IRQF_DISABLED|IRQF_TIMER,
-       .name    = "timer"
-};
-
-static void __init setup_cpu0_timer_irq(void)
-{
-       timer_irq = bind_virq_to_irqaction(VIRQ_TIMER, 0, &timer_action);
-       BUG_ON(timer_irq < 0);
-}
-
 void __init time_init(void)
 {
        init_cpu_khz();
        printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n",
               cpu_khz / 1000, cpu_khz % 1000);
 
-       switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, 0,
-                                  &xen_set_periodic_tick)) {
-       case 0:
-#if CONFIG_XEN_COMPAT <= 0x030004
-       case -ENOSYS:
-#endif
-               break;
-       default:
-               BUG();
-       }
-
+       setup_runstate_area(0);
        get_time_values_from_xen(0);
 
-       processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
-       per_cpu(local_time.processed_system, 0) = processed_system_time;
-       per_cpu(local_time.accounted_system, 0) = processed_system_time;
-       init_missing_ticks_accounting(0);
+       jiffies_bias     = jiffies_64;
+       system_time_bias = per_cpu(shadow_time, 0).system_timestamp;
 
        clocksource_register(&clocksource_xen);
 
@@ -784,10 +563,7 @@ void __init time_init(void)
        use_tsc_delay();
 
        /* Cannot request_irq() until kmem is initialised. */
-       late_time_init = setup_cpu0_timer_irq;
-
-       if (!(duty_limit + 2))
-               duty_limit = __fls(nr_cpu_ids);
+       late_time_init = xen_clockevents_init;
 }
 
 /* Convert jiffies to system time. */
@@ -803,13 +579,13 @@ u64 jiffies_to_st(unsigned long j)
                if (delta < 1) {
                        /* Triggers in some wrap-around cases, but that's okay:
                         * we just end up with a shorter timeout. */
-                       st = processed_system_time + NS_PER_TICK;
+                       st = processed_system_time() + NS_PER_TICK;
                } else if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0) {
                        /* Very long timeout means there is no pending timer.
                         * We indicate this to Xen by passing zero timeout. */
                        st = 0;
                } else {
-                       st = processed_system_time + delta * (u64)NS_PER_TICK;
+                       st = processed_system_time() + delta * (u64)NS_PER_TICK;
                }
        } while (read_seqretry(&xtime_lock, seq));
 
@@ -817,79 +593,10 @@ u64 jiffies_to_st(unsigned long j)
 }
 EXPORT_SYMBOL(jiffies_to_st);
 
-/*
- * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
- * These functions are based on implementations from arch/s390/kernel/time.c
- */
-static void stop_hz_timer(void)
-{
-       struct vcpu_set_singleshot_timer singleshot;
-       unsigned int cpu = smp_processor_id();
-       unsigned long j;
-       u64 local;
-       int rc;
-
-       cpumask_set_cpu(cpu, nohz_cpu_mask);
-
-       /* See matching smp_mb in rcu_start_batch in rcupdate.c.  These mbs  */
-       /* ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a  */
-       /* value of rcp->cur that matches rdp->quiescbatch and allows us to  */
-       /* stop the hz timer then the cpumasks created for subsequent values */
-       /* of cur in rcu_start_batch are guaranteed to pick up the updated   */
-       /* nohz_cpu_mask and so will not depend on this cpu.                 */
-
-       smp_mb();
-
-       /* Leave ourselves in tick mode if rcu or softirq or timer pending. */
-       if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
-           local_softirq_pending() ||
-           (j = get_next_timer_interrupt(jiffies),
-            time_before_eq(j, jiffies))) {
-               cpumask_clear_cpu(cpu, nohz_cpu_mask);
-               j = jiffies + 1;
-       }
-
-       singleshot.timeout_abs_ns = jiffies_to_st(j);
-       if (!singleshot.timeout_abs_ns)
-               return;
-       local = per_cpu(local_time.processed_system, cpu);
-       if ((s64)(singleshot.timeout_abs_ns - local) <= NS_PER_TICK) {
-               cpumask_clear_cpu(cpu, nohz_cpu_mask);
-               singleshot.timeout_abs_ns = local + NS_PER_TICK;
-       }
-       singleshot.timeout_abs_ns += NS_PER_TICK / 2;
-       singleshot.flags = 0;
-       rc = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &singleshot);
-#if CONFIG_XEN_COMPAT <= 0x030004
-       if (rc) {
-               BUG_ON(rc != -ENOSYS);
-               rc = HYPERVISOR_set_timer_op(singleshot.timeout_abs_ns);
-       }
-#endif
-       BUG_ON(rc);
-}
-
-static void start_hz_timer(void)
-{
-       unsigned int cpu = smp_processor_id();
-       int rc = HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL);
-
-#if CONFIG_XEN_COMPAT <= 0x030004
-       if (rc) {
-               BUG_ON(rc != -ENOSYS);
-               rc = HYPERVISOR_set_timer_op(0);
-       }
-#endif
-       BUG_ON(rc);
-       cpumask_clear_cpu(cpu, nohz_cpu_mask);
-}
-
 void xen_safe_halt(void)
 {
-       stop_hz_timer();
        /* Blocking includes an implicit local_irq_enable(). */
        HYPERVISOR_block();
-       start_hz_timer();
 }
 EXPORT_SYMBOL(xen_safe_halt);
 
@@ -900,48 +607,6 @@ void xen_halt(void)
 }
 EXPORT_SYMBOL(xen_halt);
 
-#ifdef CONFIG_SMP
-int __cpuinit local_setup_timer(unsigned int cpu)
-{
-       int seq, irq;
-
-       BUG_ON(cpu == 0);
-
-       switch (HYPERVISOR_vcpu_op(VCPUOP_set_periodic_timer, cpu,
-                          &xen_set_periodic_tick)) {
-       case 0:
-#if CONFIG_XEN_COMPAT <= 0x030004
-       case -ENOSYS:
-#endif
-               break;
-       default:
-               BUG();
-       }
-
-       do {
-               seq = read_seqbegin(&xtime_lock);
-               /* Use cpu0 timestamp: cpu's shadow is not initialised yet. */
-               per_cpu(local_time.accounted_system, cpu) =
-               per_cpu(local_time.processed_system, cpu) =
-                       per_cpu(shadow_time, 0).system_timestamp;
-               init_missing_ticks_accounting(cpu);
-       } while (read_seqretry(&xtime_lock, seq));
-
-       irq = bind_virq_to_irqaction(VIRQ_TIMER, cpu, &timer_action);
-       if (irq < 0)
-               return irq;
-       BUG_ON(timer_irq != irq);
-
-       return 0;
-}
-
-void __cpuinit local_teardown_timer(unsigned int cpu)
-{
-       BUG_ON(cpu == 0);
-       unbind_from_per_cpu_irq(timer_irq, cpu, &timer_action);
-}
-#endif
-
 #ifdef CONFIG_CPU_FREQ
 static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, 
                                void *data)
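
A minimal standalone sketch of the bias arithmetic introduced above (hypothetical
values, HZ=250 assumed; not part of the applied patch). The per-CPU
processed/accounted bookkeeping is dropped in favour of two globals, jiffies_bias
and system_time_bias, from which the last-processed system time is derived on
demand:

	/* Sketch only: mirrors processed_system_time() from the hunk above. */
	#include <stdint.h>
	#include <stdio.h>

	#define HZ 250
	#define NS_PER_TICK (1000000000LL / HZ)

	static uint64_t jiffies_bias;      /* jiffies_64 at (re)initialisation */
	static uint64_t system_time_bias;  /* Xen system time (ns) at the same instant */

	static uint64_t processed_system_time(uint64_t jiffies_64)
	{
		return (jiffies_64 - jiffies_bias) * NS_PER_TICK + system_time_bias;
	}

	int main(void)
	{
		/* Hypothetical snapshot: 5 s of Xen system time at jiffies 1000. */
		jiffies_bias = 1000;
		system_time_bias = 5000000000ULL;

		/* 100 ticks later the derived system time has advanced by 400 ms. */
		printf("%llu\n", (unsigned long long)processed_system_time(1100));
		return 0;
	}

On resume, re-reading jiffies_64 under xtime_lock and taking CPU0's shadow
timestamp re-bases both values, so no per-CPU processed/accounted state needs to
be rebuilt.
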
index f20b8bc..cb0a327 100644 (file)
@@ -1,6 +1,6 @@
 config DRM_VMWGFX
        tristate "DRM driver for VMware Virtual GPU"
-       depends on DRM && PCI
+       depends on DRM && PCI && !XEN
        select FB_DEFERRED_IO
        select FB_CFB_FILLRECT
        select FB_CFB_COPYAREA
index 4d3c0e7..c5fc13d 100644 (file)
@@ -385,9 +385,6 @@ config HAVE_IRQ_IGNORE_UNHANDLED
 config IRQ_PER_CPU
        bool
 
-config NO_IDLE_HZ
-       def_bool y
-
 config XEN_SMPBOOT
        def_bool y
        depends on SMP && !PPC_XEN
index 80aedd9..ef3d4be 100644 (file)
@@ -290,8 +290,10 @@ static void backend_changed(struct xenbus_device *dev,
                        break;
                }
                bd = bdget_disk(info->gd, 0);
-               if (bd == NULL)
+               if (bd == NULL) {
                        xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
+                       break;
+               }
 
 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17)
                down(&bd->bd_sem);
index 9048524..2cbc29a 100644 (file)
@@ -950,8 +950,6 @@ blktap_device_restart(struct blktap *tap)
        struct blktap_device *dev;
 
        dev = &tap->device;
-       if (!dev->gd || !dev->gd->queue)
-               return;
 
        if (blktap_active(tap) && RING_FULL(&tap->ring.ring)) {
                blktap_defer(tap);
@@ -967,11 +965,15 @@ blktap_device_restart(struct blktap *tap)
        spin_lock_irq(&dev->lock);
 
        /* Re-enable calldowns. */
-       if (blk_queue_stopped(dev->gd->queue))
-               blk_start_queue(dev->gd->queue);
+       if (dev->gd) {
+               struct request_queue *rq = dev->gd->queue;
+
+               if (blk_queue_stopped(rq))
+                       blk_start_queue(rq);
 
-       /* Kick things off immediately. */
-       blktap_device_do_request(dev->gd->queue);
+               /* Kick things off immediately. */
+               blktap_device_do_request(rq);
+       }
 
        spin_unlock_irq(&dev->lock);
 }
@@ -1060,6 +1062,7 @@ int
 blktap_device_destroy(struct blktap *tap)
 {
        struct blktap_device *dev = &tap->device;
+       struct gendisk *gd = dev->gd;
 
        if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
                return 0;
@@ -1071,8 +1074,9 @@ blktap_device_destroy(struct blktap *tap)
 
        spin_lock_irq(&dev->lock);
        /* No more blktap_device_do_request(). */
-       blk_stop_queue(dev->gd->queue);
+       blk_stop_queue(gd->queue);
        clear_bit(BLKTAP_DEVICE, &tap->dev_inuse);
+       dev->gd = NULL;
        spin_unlock_irq(&dev->lock);
 
 #ifdef ENABLE_PASSTHROUGH
@@ -1080,11 +1084,9 @@ blktap_device_destroy(struct blktap *tap)
                blktap_device_close_bdev(tap);
 #endif
 
-       del_gendisk(dev->gd);
-       put_disk(dev->gd);
-       blk_cleanup_queue(dev->gd->queue);
-
-       dev->gd = NULL;
+       del_gendisk(gd);
+       blk_cleanup_queue(gd->queue);
+       put_disk(gd);
 
        wake_up(&tap->wq);
 
index 8a7f8aa..04ade44 100644 (file)
@@ -352,7 +352,6 @@ _blktap_sysfs_destroy(struct device *dev)
        device_remove_file(dev, &dev_attr_resume);
        device_remove_file(dev, &dev_attr_debug);
 
-       put_device(dev);
        device_unregister(dev);
 
        clear_bit(BLKTAP_SYSFS, &tap->dev_inuse);
index 6ed74e9..25854c2 100644 (file)
@@ -13,6 +13,7 @@ obj-$(CONFIG_XEN_SYSFS)               += xen_sysfs.o
 obj-$(CONFIG_XEN_SMPBOOT)      += smpboot.o
 obj-$(CONFIG_SMP)              += spinlock.o
 obj-$(CONFIG_KEXEC)            += machine_kexec.o
+obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o
 obj-$(CONFIG_XEN_DOMCTL)       += domctl.o
 CFLAGS_domctl.o                        := -D__XEN_PUBLIC_XEN_H__ -D__XEN_PUBLIC_GRANT_TABLE_H__
 CFLAGS_domctl.o                        += -D__XEN_TOOLS__ -imacros xen/interface/domctl.h
diff --git a/drivers/xen/core/clockevents.c b/drivers/xen/core/clockevents.c
new file mode 100644 (file)
index 0000000..266e198
--- /dev/null
@@ -0,0 +1,298 @@
+/*
+ *     Xen clockevent functions
+ *
+ *     See arch/x86/xen/time.c for copyright and credits for derived
+ *     portions of this file.
+ *
+ * Xen clockevent implementation
+ *
+ * Xen has two clockevent implementations:
+ *
+ * The old timer_op one works with all released versions of Xen prior
+ * to version 3.0.4.  This version of the hypervisor provides a
+ * single-shot timer with nanosecond resolution.  However, sharing the
+ * same event channel is a 100Hz tick which is delivered while the
+ * vcpu is running.  We don't care about or use this tick, but it will
+ * cause the core time code to think the timer fired too soon, and
+ * will end up resetting it each time.  It could be filtered, but
+ * doing so has complications when the ktime clocksource is not yet
+ * the xen clocksource (ie, at boot time).
+ *
+ * The new vcpu_op-based timer interface allows the tick timer period
+ * to be changed or turned off.  The tick timer is not useful as a
+ * periodic timer because events are only delivered to running vcpus.
+ * The one-shot timer can report when a timeout is in the past, so
+ * set_next_event is capable of returning -ETIME when appropriate.
+ * This interface is used when available.
+ */
+#include <linux/clockchips.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/kernel_stat.h>
+#include <linux/math64.h>
+#include <asm/xen/hypervisor.h>
+#include <xen/clock.h>
+#include <xen/evtchn.h>
+#include <xen/interface/vcpu.h>
+
+#define XEN_SHIFT 22
+
+/* Xen may fire a timer up to this many ns early */
+#define TIMER_SLOP     100000
+#define NS_PER_TICK    (1000000000LL / HZ)
+
+/*
+ * Get a hypervisor absolute time.  In theory we could maintain an
+ * offset between the kernel's time and the hypervisor's time, and
+ * apply that to a kernel's absolute timeout.  Unfortunately the
+ * hypervisor and kernel times can drift even if the kernel is using
+ * the Xen clocksource, because ntp can warp the kernel's clocksource.
+ */
+static u64 get_abs_timeout(unsigned long delta)
+{
+       return xen_local_clock() + delta;
+}
+
+#if CONFIG_XEN_COMPAT <= 0x030004
+static void timerop_set_mode(enum clock_event_mode mode,
+                            struct clock_event_device *evt)
+{
+       switch (mode) {
+       case CLOCK_EVT_MODE_PERIODIC:
+               WARN_ON(1); /* unsupported */
+               break;
+
+       case CLOCK_EVT_MODE_ONESHOT:
+       case CLOCK_EVT_MODE_RESUME:
+               break;
+
+       case CLOCK_EVT_MODE_UNUSED:
+       case CLOCK_EVT_MODE_SHUTDOWN:
+               if (HYPERVISOR_set_timer_op(0)) /* cancel timeout */
+                       BUG();
+               break;
+       }
+}
+
+static int timerop_set_next_event(unsigned long delta,
+                                 struct clock_event_device *evt)
+{
+       WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
+
+       if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
+               BUG();
+
+       /*
+        * We may have missed the deadline, but there's no real way of
+        * knowing for sure.  If the event was in the past, then we'll
+        * get an immediate interrupt.
+        */
+
+       return 0;
+}
+#endif
+
+static void vcpuop_set_mode(enum clock_event_mode mode,
+                           struct clock_event_device *evt)
+{
+       switch (mode) {
+       case CLOCK_EVT_MODE_PERIODIC:
+               WARN_ON(1); /* unsupported */
+               break;
+
+       case CLOCK_EVT_MODE_UNUSED:
+       case CLOCK_EVT_MODE_SHUTDOWN:
+               if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer,
+                                      smp_processor_id(), NULL))
+                       BUG();
+               /* fall through */
+       case CLOCK_EVT_MODE_ONESHOT:
+               if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer,
+                                      smp_processor_id(), NULL))
+                       BUG();
+               break;
+
+       case CLOCK_EVT_MODE_RESUME:
+               break;
+       }
+}
+
+static int vcpuop_set_next_event(unsigned long delta,
+                                struct clock_event_device *evt)
+{
+       struct vcpu_set_singleshot_timer single;
+       int ret;
+
+       WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
+
+       single.timeout_abs_ns = get_abs_timeout(delta);
+       single.flags = VCPU_SSHOTTMR_future;
+
+       ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer,
+                                smp_processor_id(), &single);
+
+       BUG_ON(ret != 0 && ret != -ETIME);
+
+       return ret;
+}
+
+static DEFINE_PER_CPU(struct clock_event_device, xen_clock_event) = {
+       .name           = "xen",
+       .features       = CLOCK_EVT_FEAT_ONESHOT,
+
+       .max_delta_ns   = 0xffffffff,
+       .min_delta_ns   = TIMER_SLOP,
+
+       .mult           = 1,
+       .shift          = 0,
+       .rating         = 500,
+
+       .irq            = -1,
+};
+
+/* snapshots of runstate info */
+static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot);
+
+/* unused ns of stolen and blocked time */
+static DEFINE_PER_CPU(unsigned int, xen_residual_stolen);
+static DEFINE_PER_CPU(unsigned int, xen_residual_blocked);
+
+static void init_missing_ticks_accounting(unsigned int cpu)
+{
+       per_cpu(xen_runstate_snapshot, cpu) = *setup_runstate_area(cpu);
+       if (cpu == smp_processor_id())
+               get_runstate_snapshot(&__get_cpu_var(xen_runstate_snapshot));
+       per_cpu(xen_residual_stolen, cpu) = 0;
+       per_cpu(xen_residual_blocked, cpu) = 0;
+}
+
+static irqreturn_t timer_interrupt(int irq, void *dev_id)
+{
+       struct clock_event_device *evt = &__get_cpu_var(xen_clock_event);
+       struct vcpu_runstate_info state, *snap;
+       s64 blocked, stolen;
+       irqreturn_t ret = IRQ_NONE;
+
+       if (evt->event_handler) {
+               evt->event_handler(evt);
+               ret = IRQ_HANDLED;
+       }
+
+       xen_check_wallclock_update();
+
+       get_runstate_snapshot(&state);
+       snap = &__get_cpu_var(xen_runstate_snapshot);
+
+       stolen = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable]
+               + state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline]
+               + percpu_read(xen_residual_stolen);
+
+       if (stolen >= NS_PER_TICK)
+               account_steal_ticks(div_u64_rem(stolen, NS_PER_TICK,
+                               &__get_cpu_var(xen_residual_stolen)));
+       else
+               percpu_write(xen_residual_stolen, stolen > 0 ? stolen : 0);
+
+       blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked]
+               + percpu_read(xen_residual_blocked);
+
+       if (blocked >= NS_PER_TICK)
+               account_idle_ticks(div_u64_rem(blocked, NS_PER_TICK,
+                               &__get_cpu_var(xen_residual_blocked)));
+       else
+               percpu_write(xen_residual_blocked, blocked > 0 ? blocked : 0);
+
+       *snap = state;
+
+       return ret;
+}
+
+static struct irqaction timer_action = {
+       .handler = timer_interrupt,
+       .flags   = IRQF_DISABLED|IRQF_TIMER,
+       .name    = "timer"
+};
+
+void __cpuinit xen_setup_cpu_clockevents(void)
+{
+       unsigned int cpu = smp_processor_id();
+       struct clock_event_device *evt = &per_cpu(xen_clock_event, cpu);
+
+       init_missing_ticks_accounting(cpu);
+
+       evt->cpumask = cpumask_of(cpu);
+       clockevents_register_device(evt);
+}
+
+#ifdef CONFIG_SMP
+int __cpuinit local_setup_timer(unsigned int cpu)
+{
+       struct clock_event_device *evt = &per_cpu(xen_clock_event, cpu);
+
+       BUG_ON(cpu == smp_processor_id());
+
+       evt->irq = bind_virq_to_irqaction(VIRQ_TIMER, cpu, &timer_action);
+       if (evt->irq < 0)
+               return evt->irq;
+       BUG_ON(per_cpu(xen_clock_event.irq, 0) != evt->irq);
+
+       evt->set_mode = percpu_read(xen_clock_event.set_mode);
+       evt->set_next_event = percpu_read(xen_clock_event.set_next_event);
+
+       return 0;
+}
+
+void __cpuinit local_teardown_timer(unsigned int cpu)
+{
+       struct clock_event_device *evt = &per_cpu(xen_clock_event, cpu);
+
+       BUG_ON(cpu == 0);
+       unbind_from_per_cpu_irq(evt->irq, cpu, &timer_action);
+}
+#endif
+
+void xen_clockevents_resume(void)
+{
+       unsigned int cpu;
+
+       if (percpu_read(xen_clock_event.set_mode) != vcpuop_set_mode)
+               return;
+
+       for_each_online_cpu(cpu) {
+               init_missing_ticks_accounting(cpu);
+               if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
+                       BUG();
+       }
+}
+
+void __init xen_clockevents_init(void)
+{
+       unsigned int cpu = smp_processor_id();
+       struct clock_event_device *evt = &__get_cpu_var(xen_clock_event);
+
+       switch (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer,
+                                  cpu, NULL)) {
+       case 0:
+               /*
+                * Successfully turned off 100Hz tick, so we have the
+                * vcpuop-based timer interface
+                */
+               evt->set_mode = vcpuop_set_mode;
+               evt->set_next_event = vcpuop_set_next_event;
+               break;
+#if CONFIG_XEN_COMPAT <= 0x030004
+       case -ENOSYS:
+               printk(KERN_DEBUG "Xen: using timerop interface\n");
+               evt->set_mode = timerop_set_mode;
+               evt->set_next_event = timerop_set_next_event;
+               break;
+#endif
+       default:
+               BUG();
+       }
+
+       evt->irq = bind_virq_to_irqaction(VIRQ_TIMER, cpu, &timer_action);
+       BUG_ON(evt->irq < 0);
+
+       xen_setup_cpu_clockevents();
+}
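
A minimal sketch of the residual-carrying stolen/blocked accounting done in the
new timer_interrupt() above (assumes HZ=250; account_whole_ticks() stands in for
account_steal_ticks()/account_idle_ticks(); not part of the applied patch):

	/* Sketch only: whole ticks are accounted, the sub-tick remainder is
	 * carried over, as xen_residual_stolen/xen_residual_blocked do. */
	#include <stdint.h>
	#include <stdio.h>

	#define NS_PER_TICK 4000000LL	/* 1000000000 / HZ with HZ=250 */

	static unsigned int residual;	/* leftover ns carried to the next interrupt */

	static void account_whole_ticks(const char *what, int64_t delta_ns)
	{
		delta_ns += residual;
		if (delta_ns >= NS_PER_TICK) {
			uint64_t ticks = delta_ns / NS_PER_TICK;

			residual = delta_ns % NS_PER_TICK;	/* div_u64_rem() in the patch */
			printf("%s: %llu tick(s), %u ns carried\n",
			       what, (unsigned long long)ticks, residual);
		} else {
			residual = delta_ns > 0 ? delta_ns : 0;
		}
	}

	int main(void)
	{
		account_whole_ticks("stolen", 6500000);	/* 1 tick, 2.5 ms carried */
		account_whole_ticks("stolen", 1500000);	/* carried remainder yields 1 more tick */
		return 0;
	}

Carrying the remainder keeps the long-run steal/idle totals accurate even though
timer events no longer arrive at a fixed periodic rate once the one-shot
clockevent mode is in use.
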
index 648a309..f076184 100644 (file)
@@ -13,6 +13,7 @@
 #include <asm/hypervisor.h>
 #include <xen/xenbus.h>
 #include <linux/cpu.h>
+#include <xen/clock.h>
 #include <xen/gnttab.h>
 #include <xen/xencons.h>
 #include <xen/cpu_hotplug.h>
@@ -176,10 +177,12 @@ static int take_machine_down(void *_suspend)
        } else
                BUG_ON(suspend_cancelled > 0);
        suspend->resume_notifier(suspend_cancelled);
-       if (suspend_cancelled >= 0) {
+       if (suspend_cancelled >= 0)
                post_suspend(suspend_cancelled, suspend->fast_suspend);
+       if (!suspend_cancelled)
+               xen_clockevents_resume();
+       if (suspend_cancelled >= 0)
                sysdev_resume();
-       }
        if (!suspend_cancelled) {
 #ifdef __x86_64__
                /*
index 588d63c..be85cd4 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/percpu.h>
 #include <asm/desc.h>
 #include <asm/pgalloc.h>
+#include <xen/clock.h>
 #include <xen/evtchn.h>
 #include <xen/interface/vcpu.h>
 #include <xen/cpu_hotplug.h>
@@ -198,6 +199,7 @@ static void __cpuinit cpu_bringup(void)
        identify_secondary_cpu(&current_cpu_data);
        touch_softlockup_watchdog();
        preempt_disable();
+       xen_setup_cpu_clockevents();
        local_irq_enable();
 }
 
index eb21d94..63a6c3c 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/kernel.h>
 #include <linux/kernel_stat.h>
 #include <linux/module.h>
+#include <xen/clock.h>
 #include <xen/evtchn.h>
 
 #ifdef TICKET_SHIFT
@@ -189,6 +190,7 @@ bool xen_spin_wait(arch_spinlock_t *lock, unsigned int *ptok,
        other = spinning.prev;
        percpu_write(spinning, other);
        rm_lock = &__get_cpu_var(spinning_rm_lock);
+       raw_local_irq_disable();
        arch_write_lock(rm_lock);
        arch_write_unlock(rm_lock);
        *ptok = lock->cur | (spinning.ticket << TICKET_SHIFT);
@@ -203,7 +205,6 @@ bool xen_spin_wait(arch_spinlock_t *lock, unsigned int *ptok,
                        bool free;
 
                        lock = other->lock;
-                       raw_local_irq_disable();
                        __ticket_spin_lock_preamble;
                        if (!free)
                                token = spin_adjust(other->prev, lock, token);
diff --git a/include/xen/clock.h b/include/xen/clock.h
new file mode 100644 (file)
index 0000000..935b433
--- /dev/null
@@ -0,0 +1,19 @@
+#ifndef __XEN_CPU_CLOCK_H__
+#define __XEN_CPU_CLOCK_H__
+
+struct vcpu_runstate_info *setup_runstate_area(unsigned int cpu);
+void get_runstate_snapshot(struct vcpu_runstate_info *);
+
+unsigned long long xen_local_clock(void);
+void xen_check_wallclock_update(void);
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+void xen_clockevents_init(void);
+void xen_setup_cpu_clockevents(void);
+void xen_clockevents_resume(void);
+#else
+static inline void xen_setup_cpu_clockevents(void) {}
+static inline void xen_clockevents_resume(void) {}
+#endif
+
+#endif /* __XEN_CPU_CLOCK_H__ */
index 0978fb1..928f392 100644 (file)
@@ -47,6 +47,8 @@
 /*
  * LOW-LEVEL DEFINITIONS
  */
+
+#ifdef CONFIG_XEN
 struct irq_cfg {
        u32 info;
        union {
@@ -56,6 +58,7 @@ struct irq_cfg {
 #endif
        };
 };
+#endif
 
 /*
  * Dynamically bind an event source to an IRQ-like callback handler.
index 5154d2f..0086628 100644 (file)
@@ -1108,7 +1108,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
 }
 EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
 
-#if defined(CONFIG_NO_HZ) || defined(CONFIG_NO_IDLE_HZ)
+#ifdef CONFIG_NO_HZ
 /**
  * hrtimer_get_next_event - get the time until next expiry event
  *
index d84f9c0..c61a794 100644 (file)
@@ -1042,7 +1042,7 @@ static inline void __run_timers(struct tvec_base *base)
        spin_unlock_irq(&base->lock);
 }
 
-#if defined(CONFIG_NO_HZ) || defined(CONFIG_NO_IDLE_HZ)
+#ifdef CONFIG_NO_HZ
 /*
  * Find out when the next timer event is due to happen. This
  * is used on S/390 to stop all activity when a CPU is idle.