sched/x86: Fix overflow in cyc2ns_offset

[linux-flexiantxendom0.git] / arch / x86 / kernel / tsc.c
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c

index 9eeb9be..9a987d4 100644 (file)
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -5,7 +5,6 @@
  #include <linux/timer.h>
  #include <linux/acpi_pmtmr.h>
  #include <linux/cpufreq.h>
-#include <linux/dmi.h>
  #include <linux/delay.h>
  #include <linux/clocksource.h>
  #include <linux/percpu.h>
@@ -50,7 +49,7 @@ u64 native_sched_clock(void)
          *   unstable. We do this because unlike Time Of Day,
          *   the scheduler clock tolerates small errors and it's
          *   very important for it to be as fast as the platform
-        *   can achive it. )
+        *   can achieve it. )
          */
         if (unlikely(tsc_disabled)) {
                 /* No locking but a rare wrong value is not a big deal: */
@@ -104,10 +103,14 @@ int __init notsc_setup(char *str)
  
  __setup("notsc", notsc_setup);
  
+static int no_sched_irq_time;
+
  static int __init tsc_setup(char *str)
  {
         if (!strcmp(str, "reliable"))
                 tsc_clocksource_reliable = 1;
+       if (!strncmp(str, "noirqtime", 9))
+               no_sched_irq_time = 1;
         return 1;
  }
  
@@ -287,14 +290,15 @@ static inline int pit_verify_msb(unsigned char val)
  static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)
  {
         int count;
-       u64 tsc = 0;
+       u64 tsc = 0, prev_tsc = 0;
  
         for (count = 0; count < 50000; count++) {
                 if (!pit_verify_msb(val))
                         break;
+               prev_tsc = tsc;
                 tsc = get_cycles();
         }
-       *deltap = get_cycles() - tsc;
+       *deltap = get_cycles() - prev_tsc;
         *tscp = tsc;
  
         /*
@@ -308,9 +312,9 @@ static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *de
   * How many MSB values do we want to see? We aim for
   * a maximum error rate of 500ppm (in practice the
   * real error is much smaller), but refuse to spend
- * more than 25ms on it.
+ * more than 50ms on it.
   */
-#define MAX_QUICK_PIT_MS 25
+#define MAX_QUICK_PIT_MS 50
  #define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
  
  static unsigned long quick_pit_calibrate(void)
@@ -380,15 +384,12 @@ success:
          *
          * As a result, we can depend on there not being
          * any odd delays anywhere, and the TSC reads are
-        * reliable (within the error). We also adjust the
-        * delta to the middle of the error bars, just
-        * because it looks nicer.
+        * reliable (within the error).
          *
          * kHz = ticks / time-in-seconds / 1000;
          * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000
          * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000)
          */
-       delta += (long)(d2 - d1)/2;
         delta *= PIT_TICK_RATE;
         do_div(delta, i*256*1000);
         printk("Fast TSC calibration using PIT\n");
@@ -423,7 +424,7 @@ unsigned long native_calibrate_tsc(void)
          * the delta to the previous read. We keep track of the min
          * and max values of that delta. The delta is mostly defined
          * by the IO time of the PIT access, so we can detect when a
-        * SMI/SMM disturbance happend between the two reads. If the
+        * SMI/SMM disturbance happened between the two reads. If the
          * maximum time is significantly larger than the minimum time,
          * then we discard the result and have another try.
          *
@@ -460,7 +461,7 @@ unsigned long native_calibrate_tsc(void)
                 tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);
  
                 /* hpet or pmtimer available ? */
-               if (!hpet && !ref1 && !ref2)
+               if (ref1 == ref2)
                         continue;
  
                 /* Check, whether the sampling was disturbed by an SMI */
@@ -619,13 +620,52 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
  
         if (cpu_khz) {
                 *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
-               *offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR);
+               *offset = ns_now - mult_frac(tsc_now, *scale,
+                                            (1UL << CYC2NS_SCALE_FACTOR));
         }
  
         sched_clock_idle_wakeup_event(0);
         local_irq_restore(flags);
  }
  
+static unsigned long long cyc2ns_suspend;
+
+void save_sched_clock_state(void)
+{
+       if (!sched_clock_stable)
+               return;
+
+       cyc2ns_suspend = sched_clock();
+}
+
+/*
+ * Even on processors with invariant TSC, TSC gets reset in some of the
+ * ACPI system sleep states. And on some systems the BIOS seems to reinit TSC
+ * to an arbitrary value (still sync'd across CPUs) during resume from such
+ * sleep states. To cope with this, recompute the cyc2ns_offset for each cpu
+ * so that sched_clock() continues from the point where it was left off
+ * during suspend.
+ */
+void restore_sched_clock_state(void)
+{
+       unsigned long long offset;
+       unsigned long flags;
+       int cpu;
+
+       if (!sched_clock_stable)
+               return;
+
+       local_irq_save(flags);
+
+       __this_cpu_write(cyc2ns_offset, 0);
+       offset = cyc2ns_suspend - sched_clock();
+
+       for_each_possible_cpu(cpu)
+               per_cpu(cyc2ns_offset, cpu) = offset;
+
+       local_irq_restore(flags);
+}
+
  #ifdef CONFIG_CPU_FREQ
  
  /* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
@@ -721,25 +761,6 @@ static cycle_t read_tsc(struct clocksource *cs)
                 ret : clocksource_tsc.cycle_last;
  }
  
-#ifdef CONFIG_X86_64
-static cycle_t __vsyscall_fn vread_tsc(void)
-{
-       cycle_t ret;
-
-       /*
-        * Surround the RDTSC by barriers, to make sure it's not
-        * speculated to outside the seqlock critical section and
-        * does not cause time warps:
-        */
-       rdtsc_barrier();
-       ret = (cycle_t)vget_cycles();
-       rdtsc_barrier();
-
-       return ret >= __vsyscall_gtod_data.clock.cycle_last ?
-               ret : __vsyscall_gtod_data.clock.cycle_last;
-}
-#endif
-
  static void resume_tsc(struct clocksource *cs)
  {
         clocksource_tsc.cycle_last = 0;
@@ -751,11 +772,10 @@ static struct clocksource clocksource_tsc = {
         .read                   = read_tsc,
         .resume                 = resume_tsc,
         .mask                   = CLOCKSOURCE_MASK(64),
-       .shift                  = 22,
         .flags                  = CLOCK_SOURCE_IS_CONTINUOUS |
                                   CLOCK_SOURCE_MUST_VERIFY,
  #ifdef CONFIG_X86_64
-       .vread                  = vread_tsc,
+       .archdata               = { .vclock_mode = VCLOCK_TSC },
  #endif
  };
  
@@ -764,6 +784,7 @@ void mark_tsc_unstable(char *reason)
         if (!tsc_unstable) {
                 tsc_unstable = 1;
                 sched_clock_stable = 0;
+               disable_sched_clock_irqtime();
                 printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
                 /* Change only the rating, when not registered */
                 if (clocksource_tsc.mult)
@@ -777,27 +798,6 @@ void mark_tsc_unstable(char *reason)
  
  EXPORT_SYMBOL_GPL(mark_tsc_unstable);
  
-static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d)
-{
-       printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
-                       d->ident);
-       tsc_unstable = 1;
-       return 0;
-}
-
-/* List of systems that have known TSC problems */
-static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
-       {
-               .callback = dmi_mark_tsc_unstable,
-               .ident = "IBM Thinkpad 380XD",
-               .matches = {
-                       DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
-                       DMI_MATCH(DMI_BOARD_NAME, "2635FA0"),
-               },
-       },
-       {}
-};
-
  static void __init check_system_tsc_reliable(void)
  {
  #ifdef CONFIG_MGEODE_LX
@@ -806,7 +806,7 @@ static void __init check_system_tsc_reliable(void)
         unsigned long res_low, res_high;
  
         rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
-       /* Geode_LX - the OLPC CPU has a possibly a very reliable TSC */
+       /* Geode_LX - the OLPC CPU has a very reliable TSC */
         if (res_low & RTSC_SUSP)
                 tsc_clocksource_reliable = 1;
  #endif
@@ -830,6 +830,9 @@ __cpuinit int unsynchronized_tsc(void)
  
         if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
                 return 0;
+
+       if (tsc_clocksource_reliable)
+               return 0;
         /*
          * Intel systems are normally all synchronized.
          * Exceptions must mark TSC as unstable:
@@ -837,16 +840,92 @@ __cpuinit int unsynchronized_tsc(void)
         if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
                 /* assume multi socket systems are not synchronized: */
                 if (num_possible_cpus() > 1)
-                       tsc_unstable = 1;
+                       return 1;
         }
  
-       return tsc_unstable;
+       return 0;
  }
  
-static void __init init_tsc_clocksource(void)
+
+static void tsc_refine_calibration_work(struct work_struct *work);
+static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
+/**
+ * tsc_refine_calibration_work - Further refine tsc freq calibration
+ * @work: ignored.
+ *
+ * This function uses delayed work over a period of a
+ * second to further refine the TSC freq value. Since this is
+ * timer based, instead of loop based, we don't block the boot
+ * process while this longer calibration is done.
+ *
+ * If there are any calibration anomalies (too many SMIs, etc),
+ * or the refined calibration is off by more than 1% from the fast
+ * early calibration, we throw out the new calibration and use the
+ * early calibration.
+ */
+static void tsc_refine_calibration_work(struct work_struct *work)
  {
-       clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
-                       clocksource_tsc.shift);
+       static u64 tsc_start = -1, ref_start;
+       static int hpet;
+       u64 tsc_stop, ref_stop, delta;
+       unsigned long freq;
+
+       /* Don't bother refining TSC on unstable systems */
+       if (check_tsc_unstable())
+               goto out;
+
+       /*
+        * Since the work is started early in boot, we may be
+        * delayed the first time we expire. So set the workqueue
+        * again once we know timers are working.
+        */
+       if (tsc_start == -1) {
+               /*
+                * Only set hpet once, to avoid mixing hardware
+                * if the hpet becomes enabled later.
+                */
+               hpet = is_hpet_enabled();
+               schedule_delayed_work(&tsc_irqwork, HZ);
+               tsc_start = tsc_read_refs(&ref_start, hpet);
+               return;
+       }
+
+       tsc_stop = tsc_read_refs(&ref_stop, hpet);
+
+       /* hpet or pmtimer available ? */
+       if (ref_start == ref_stop)
+               goto out;
+
+       /* Check, whether the sampling was disturbed by an SMI */
+       if (tsc_start == ULLONG_MAX || tsc_stop == ULLONG_MAX)
+               goto out;
+
+       delta = tsc_stop - tsc_start;
+       delta *= 1000000LL;
+       if (hpet)
+               freq = calc_hpet_ref(delta, ref_start, ref_stop);
+       else
+               freq = calc_pmtimer_ref(delta, ref_start, ref_stop);
+
+       /* Make sure we're within 1% */
+       if (abs(tsc_khz - freq) > tsc_khz/100)
+               goto out;
+
+       tsc_khz = freq;
+       printk(KERN_INFO "Refined TSC clocksource calibration: "
+               "%lu.%03lu MHz.\n", (unsigned long)tsc_khz / 1000,
+                                       (unsigned long)tsc_khz % 1000);
+
+out:
+       clocksource_register_khz(&clocksource_tsc, tsc_khz);
+}
+
+
+static int __init init_tsc_clocksource(void)
+{
+       if (!cpu_has_tsc || tsc_disabled > 0 || !tsc_khz)
+               return 0;
+
         if (tsc_clocksource_reliable)
                 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
         /* lower the rating if we already know its unstable: */
@@ -854,62 +933,24 @@ static void __init init_tsc_clocksource(void)
                 clocksource_tsc.rating = 0;
                 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
         }
-       clocksource_register(&clocksource_tsc);
-}
-
-#ifdef CONFIG_X86_64
-/*
- * calibrate_cpu is used on systems with fixed rate TSCs to determine
- * processor frequency
- */
-#define TICK_COUNT 100000000
-static unsigned long __init calibrate_cpu(void)
-{
-       int tsc_start, tsc_now;
-       int i, no_ctr_free;
-       unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
-       unsigned long flags;
-
-       for (i = 0; i < 4; i++)
-               if (avail_to_resrv_perfctr_nmi_bit(i))
-                       break;
-       no_ctr_free = (i == 4);
-       if (no_ctr_free) {
-               WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... "
-                    "cpu_khz value may be incorrect.\n");
-               i = 3;
-               rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
-               wrmsrl(MSR_K7_EVNTSEL3, 0);
-               rdmsrl(MSR_K7_PERFCTR3, pmc3);
-       } else {
-               reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
-               reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
-       }
-       local_irq_save(flags);
-       /* start measuring cycles, incrementing from 0 */
-       wrmsrl(MSR_K7_PERFCTR0 + i, 0);
-       wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
-       rdtscl(tsc_start);
-       do {
-               rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
-               tsc_now = get_cycles();
-       } while ((tsc_now - tsc_start) < TICK_COUNT);
  
-       local_irq_restore(flags);
-       if (no_ctr_free) {
-               wrmsrl(MSR_K7_EVNTSEL3, 0);
-               wrmsrl(MSR_K7_PERFCTR3, pmc3);
-               wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
-       } else {
-               release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
-               release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
+       /*
+        * Trust the results of the earlier calibration on systems
+        * exporting a reliable TSC.
+        */
+       if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
+               clocksource_register_khz(&clocksource_tsc, tsc_khz);
+               return 0;
         }
  
-       return pmc_now * tsc_khz / (tsc_now - tsc_start);
+       schedule_delayed_work(&tsc_irqwork, 0);
+       return 0;
  }
-#else
-static inline unsigned long calibrate_cpu(void) { return cpu_khz; }
-#endif
+/*
+ * We use device_initcall here, to ensure we run after the hpet
+ * is fully initialized, which may occur at fs_initcall time.
+ */
+device_initcall(init_tsc_clocksource);
  
  void __init tsc_init(void)
  {
@@ -929,10 +970,6 @@ void __init tsc_init(void)
                 return;
         }
  
-       if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
-                       (boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
-               cpu_khz = calibrate_cpu();
-
         printk("Detected %lu.%03lu MHz processor.\n",
                         (unsigned long)cpu_khz / 1000,
                         (unsigned long)cpu_khz % 1000);
@@ -952,18 +989,18 @@ void __init tsc_init(void)
         /* now allow native_sched_clock() to use rdtsc */
         tsc_disabled = 0;
  
+       if (!no_sched_irq_time)
+               enable_sched_clock_irqtime();
+
         lpj = ((u64)tsc_khz * 1000);
         do_div(lpj, HZ);
         lpj_fine = lpj;
  
         use_tsc_delay();
-       /* Check and install the TSC clocksource */
-       dmi_check_system(bad_tsc_dmi_table);
  
         if (unsynchronized_tsc())
                 mark_tsc_unstable("TSCs unsynchronized");
  
         check_system_tsc_reliable();
-       init_tsc_clocksource();
  }