arch/x86/kernel/time-xen.c
/*
 *  Copyright (c) 1991,1992,1995  Linus Torvalds
 *  Copyright (c) 1994  Alan Modra
 *  Copyright (c) 1995  Markus Kuhn
 *  Copyright (c) 1996  Ingo Molnar
 *  Copyright (c) 1998  Andrea Arcangeli
 *  Copyright (c) 2002,2006  Vojtech Pavlik
 *  Copyright (c) 2003  Andi Kleen
 *
 */

#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/time.h>
#include <linux/export.h>
#include <linux/sysctl.h>
#include <linux/percpu.h>
#include <linux/kernel_stat.h>
#include <linux/posix-timers.h>
#include <linux/cpufreq.h>
#include <linux/clocksource.h>

#include <asm/vsyscall.h>
#include <asm/delay.h>
#include <asm/time.h>
#include <asm/timer.h>

#include <xen/clock.h>
#include <xen/sysctl.h>
#include <xen/interface/vcpu.h>

#ifdef CONFIG_X86_64
DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES;
#endif

#define XEN_SHIFT 22

unsigned int cpu_khz;   /* Detected as we calibrate the TSC */
EXPORT_SYMBOL(cpu_khz);

/* These are periodically updated in shared_info, and then copied here. */
struct shadow_time_info {
        u64 tsc_timestamp;     /* TSC at last update of time vals.  */
        u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
        u32 tsc_to_nsec_mul;
        u32 tsc_to_usec_mul;
        int tsc_shift;
        u32 version;
};
static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
static struct timespec shadow_tv;
static u32 shadow_tv_version;

static u64 jiffies_bias, system_time_bias;

/* Current runstate of each CPU (updated automatically by the hypervisor). */
DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);

/* Must be signed, as it's compared with s64 quantities which can be -ve. */
#define NS_PER_TICK (1000000000LL/HZ)

/* Does this guest OS track Xen time, or set its wall clock independently? */
static int independent_wallclock = 0;
static int __init __independent_wallclock(char *str)
{
        independent_wallclock = 1;
        return 1;
}
__setup("independent_wallclock", __independent_wallclock);

/* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
static int __init __permitted_clock_jitter(char *str)
{
        permitted_clock_jitter = simple_strtoul(str, NULL, 0);
        return 1;
}
__setup("permitted_clock_jitter=", __permitted_clock_jitter);

/*
 * Scale a 64-bit delta: shift it, then multiply by a 32-bit fraction,
 * yielding a 64-bit result.
 */
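/*
 * Worked example (illustrative numbers, not from real hardware): for a
 * 2GHz TSC, Xen publishes mul_frac = ((u64)NSEC_PER_SEC << 32) / 2000000000
 * = 0x80000000 and shift = 0, so scale_delta(4000, 0x80000000, 0)
 * = (4000 * 0x80000000) >> 32 = 2000, i.e. 4000 cycles == 2000ns.
 */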
static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
{
        u64 product;
#ifdef __i386__
        u32 tmp1, tmp2;
#endif

        if (shift < 0)
                delta >>= -shift;
        else
                delta <<= shift;

#ifdef __i386__
        __asm__ (
                "mul  %5       ; "
                "mov  %4,%%eax ; "
                "mov  %%edx,%4 ; "
                "mul  %5       ; "
                "xor  %5,%5    ; "
                "add  %4,%%eax ; "
                "adc  %5,%%edx ; "
                : "=A" (product), "=r" (tmp1), "=r" (tmp2)
                : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
#else
        __asm__ (
                "mul %%rdx ; shrd $32,%%rdx,%%rax"
                : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
#endif

        return product;
}

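/*
 * Derive cpu_khz from the scaling factors Xen reports: tsc_to_system_mul
 * converts TSC ticks to ns as (ticks * mul) >> 32 (after applying
 * tsc_shift), so inverting it yields ticks per ns, and scaling by 10^6
 * gives ticks per ms, i.e. kHz.
 */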
static void init_cpu_khz(void)
{
        u64 __cpu_khz = 1000000ULL << 32;
        struct vcpu_time_info *info = &vcpu_info(0)->time;
        do_div(__cpu_khz, info->tsc_to_system_mul);
        if (info->tsc_shift < 0)
                cpu_khz = __cpu_khz << -info->tsc_shift;
        else
                cpu_khz = __cpu_khz >> info->tsc_shift;
}

static u64 get_nsec_offset(struct shadow_time_info *shadow)
{
        u64 now, delta;
        rdtscll(now);
        delta = now - shadow->tsc_timestamp;
        return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
}

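/*
 * System time (ns since VM start) reconstructed from the jiffies
 * counter: jiffies_bias and system_time_bias pair a jiffies_64 reading
 * with Xen's system time at initialisation or resume.
 */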
static inline u64 processed_system_time(u64 jiffies_64)
{
        return (jiffies_64 - jiffies_bias) * NS_PER_TICK + system_time_bias;
}

static void update_wallclock(bool local)
{
        static DEFINE_MUTEX(uwc_mutex);
        shared_info_t *s = HYPERVISOR_shared_info;

        mutex_lock(&uwc_mutex);

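        /*
         * Xen updates wc_sec/wc_nsec under a version protocol: wc_version
         * is made odd before an update and even again afterwards.  Retry
         * while an update is in flight or the version changed mid-read.
         */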
        do {
                shadow_tv_version = s->wc_version;
                rmb();
                shadow_tv.tv_sec  = s->wc_sec;
                shadow_tv.tv_nsec = s->wc_nsec;
                rmb();
        } while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version));

        if (local) {
                u64 tmp = processed_system_time(get_jiffies_64());
                long nsec = do_div(tmp, NSEC_PER_SEC);
                struct timespec tv;

                set_normalized_timespec(&tv, shadow_tv.tv_sec + tmp,
                                        shadow_tv.tv_nsec + nsec);
                do_settimeofday(&tv);
        }

        mutex_unlock(&uwc_mutex);
}

static void _update_wallclock(struct work_struct *unused)
{
        update_wallclock(true);
}
static DECLARE_WORK(update_wallclock_work, _update_wallclock);

void xen_check_wallclock_update(void)
{
        if (shadow_tv_version != HYPERVISOR_shared_info->wc_version
            && !is_initial_xendomain() && !independent_wallclock
            && keventd_up())
                schedule_work(&update_wallclock_work);
}

/*
 * Reads a consistent set of time-base values from Xen, into a shadow data
 * area.
 */
static void get_time_values_from_xen(unsigned int cpu)
{
        struct vcpu_time_info   *src;
        struct shadow_time_info *dst;
        unsigned long flags;
        u32 pre_version, post_version;

        src = &vcpu_info(cpu)->time;
        dst = &per_cpu(shadow_time, cpu);

        local_irq_save(flags);

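        /*
         * Same even/odd version protocol as the wallclock fields: an odd
         * src->version means Xen is mid-update, and a version change
         * across the copy means we raced with one, so retry.
         */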
        do {
                pre_version = dst->version = src->version;
                rmb();
                dst->tsc_timestamp     = src->tsc_timestamp;
                dst->system_timestamp  = src->system_time;
                dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
                dst->tsc_shift         = src->tsc_shift;
                rmb();
                post_version = src->version;
        } while ((pre_version & 1) | (pre_version ^ post_version));

        dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;

        local_irq_restore(flags);
}

static inline int time_values_up_to_date(void)
{
        rmb();
        return this_cpu_read(shadow_time.version) == vcpu_info_read(time.version);
}

#ifdef CONFIG_XEN_PRIVILEGED_GUEST
int xen_update_wallclock(const struct timespec *tv)
{
        struct timespec now;
        s64 nsec;
        struct shadow_time_info *shadow;
        struct xen_platform_op op;

        if (!is_initial_xendomain() || independent_wallclock)
                return -EPERM;

        shadow = &__get_cpu_var(shadow_time);

        /*
         * Guard against being blocked for so long that our time delta
         * overflows: if that happened, the shadow time values would be
         * stale, so refresh them and retry.
         */
        for (;;) {
                nsec = tv->tv_nsec - get_nsec_offset(shadow);
                if (time_values_up_to_date())
                        break;
                get_time_values_from_xen(smp_processor_id());
        }
        set_normalized_timespec(&now, tv->tv_sec, nsec);

        op.cmd = XENPF_settime;
        op.u.settime.secs        = now.tv_sec;
        op.u.settime.nsecs       = now.tv_nsec;
        op.u.settime.system_time = shadow->system_timestamp;
        WARN_ON(HYPERVISOR_platform_op(&op));
        update_wallclock(false);

        return 0;
}

static void sync_xen_wallclock(unsigned long dummy);
static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
static void sync_xen_wallclock(unsigned long dummy)
{
        struct timespec now, ignore;
        struct xen_platform_op op;

        BUG_ON(!is_initial_xendomain());
        if (!ntp_synced() || independent_wallclock)
                return;

        get_xtime_and_monotonic_and_sleep_offset(&now, &ignore, &ignore);
        set_normalized_timespec(&now, now.tv_sec, now.tv_nsec);

        op.cmd = XENPF_settime;
        op.u.settime.secs        = now.tv_sec;
        op.u.settime.nsecs       = now.tv_nsec;
        op.u.settime.system_time = processed_system_time(get_jiffies_64());
        WARN_ON(HYPERVISOR_platform_op(&op));

        update_wallclock(false);

        /* Once per minute. */
        mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
}
#endif /* CONFIG_XEN_PRIVILEGED_GUEST */

unsigned long long xen_local_clock(void)
{
        unsigned int cpu = get_cpu();
        struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
        u64 time;
        u32 local_time_version;

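        /*
         * Loop until the shadow snapshot stayed constant across the whole
         * computation, refreshing it from Xen whenever it lags the live
         * per-VCPU copy.
         */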
        do {
                local_time_version = shadow->version;
                rdtsc_barrier();
                time = shadow->system_timestamp + get_nsec_offset(shadow);
                if (!time_values_up_to_date())
                        get_time_values_from_xen(cpu);
                barrier();
        } while (local_time_version != shadow->version);

        put_cpu();

        return time;
}

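/*
 * Wall-clock seconds: Xen's wc_sec/wc_nsec record the wall-clock time
 * at system-time zero, so adding the current system time yields "now".
 */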
unsigned long xen_read_wallclock(void)
{
        const shared_info_t *s = HYPERVISOR_shared_info;
        u32 version, sec, nsec;
        u64 delta;

        do {
                version = s->wc_version;
                rmb();
                sec     = s->wc_sec;
                nsec    = s->wc_nsec;
                rmb();
        } while ((s->wc_version & 1) | (version ^ s->wc_version));

        delta = xen_local_clock() + (u64)sec * NSEC_PER_SEC + nsec;
        do_div(delta, NSEC_PER_SEC);

        return delta;
}

int xen_write_wallclock(unsigned long now)
{
        if (!is_initial_xendomain() || independent_wallclock)
                return 0;

#ifdef CONFIG_XEN_PRIVILEGED_GUEST
        mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
#endif

        return mach_set_rtc_mmss(now);
}

/*
 * Runstate accounting
 */
static void get_runstate_snapshot(struct vcpu_runstate_info *res)
{
        u64 state_time;
        struct vcpu_runstate_info *state;

        BUG_ON(preemptible());

        state = &__get_cpu_var(runstate);

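        /*
         * The hypervisor rewrites the runstate area on every state change;
         * a change in state_entry_time across the copy indicates we raced
         * with such an update and must re-read.
         */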
        do {
                state_time = get_64bit_local(&state->state_entry_time);
                *res = *state;
        } while (get_64bit_local(&state->state_entry_time) != state_time);

        WARN_ON_ONCE(res->state != RUNSTATE_running);
}

/*
 * Xen sched_clock implementation.  Returns the number of unstolen
 * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
 * states.
 */
unsigned long long sched_clock(void)
{
        struct vcpu_runstate_info runstate;
        cycle_t now;
        u64 ret;
        s64 offset;

        /*
         * Ideally sched_clock should be called on a per-cpu basis
         * anyway, so preempt should already be disabled, but that's
         * not current practice at the moment.
         */
        preempt_disable();

        now = xen_local_clock();

        get_runstate_snapshot(&runstate);

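        /*
         * Time accrued in the current (running) state since the last state
         * change; clamp negative values that can appear when the clock and
         * the runstate snapshot come from slightly skewed time bases.
         */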
        offset = now - runstate.state_entry_time;
        if (offset < 0)
                offset = 0;

        ret = offset + runstate.time[RUNSTATE_running]
              + runstate.time[RUNSTATE_blocked];

        preempt_enable();

        return ret;
}

unsigned long profile_pc(struct pt_regs *regs)
{
        unsigned long pc = instruction_pointer(regs);

        if (!user_mode_vm(regs) && in_lock_functions(pc)) {
#ifdef CONFIG_FRAME_POINTER
                return *(unsigned long *)(regs->bp + sizeof(long));
#else
                unsigned long *sp =
                        (unsigned long *)kernel_stack_pointer(regs);

                /*
                 * Return address is either directly at stack pointer
                 * or above a saved flags. Eflags has bits 22-31 zero,
                 * kernel addresses don't.
                 */
                if (sp[0] >> 22)
                        return sp[0];
                if (sp[1] >> 22)
                        return sp[1];
#endif
        }

        return pc;
}
EXPORT_SYMBOL(profile_pc);

void mark_tsc_unstable(char *reason)
{
#ifndef CONFIG_XEN /* XXX Should tell the hypervisor about this fact. */
        tsc_unstable = 1;
#endif
}
EXPORT_SYMBOL_GPL(mark_tsc_unstable);

static cycle_t cs_last;

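/*
 * On SMP, per-VCPU clocks can be slightly skewed, so clamp reads to the
 * last value returned globally: a small backwards step just returns that
 * value again, and the cmpxchg loop advances it without locking.
 */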
static cycle_t xen_clocksource_read(struct clocksource *cs)
{
#ifdef CONFIG_SMP
        cycle_t last = get_64bit(&cs_last);
        cycle_t ret = xen_local_clock();

        if (unlikely((s64)(ret - last) < 0)) {
                if (last - ret > permitted_clock_jitter
                    && printk_ratelimit()) {
                        unsigned int cpu = get_cpu();
                        struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);

                        printk(KERN_WARNING "clocksource/%u: "
                               "Time went backwards: "
                               "ret=%Lx delta=%Ld shadow=%Lx offset=%Lx\n",
                               cpu, ret, ret - last, shadow->system_timestamp,
                               get_nsec_offset(shadow));
                        put_cpu();
                }
                return last;
        }

        for (;;) {
                cycle_t cur = cmpxchg64(&cs_last, last, ret);

                if (cur == last || (s64)(ret - cur) < 0)
                        return ret;
                last = cur;
        }
#else
        return xen_local_clock();
#endif
}

/* No locking required. Interrupts are disabled on all CPUs. */
static void xen_clocksource_resume(struct clocksource *cs)
{
        unsigned int cpu;

        init_cpu_khz();

        for_each_online_cpu(cpu)
                get_time_values_from_xen(cpu);

        jiffies_bias = get_jiffies_64();
        system_time_bias = per_cpu(shadow_time, 0).system_timestamp;

        cs_last = xen_local_clock();
}

static struct clocksource clocksource_xen = {
        .name                   = "xen",
        .rating                 = 400,
        .read                   = xen_clocksource_read,
        .mask                   = CLOCKSOURCE_MASK(64),
        .mult                   = 1 << XEN_SHIFT,               /* time directly in nanoseconds */
        .shift                  = XEN_SHIFT,
        .flags                  = CLOCK_SOURCE_IS_CONTINUOUS,
        .resume                 = xen_clocksource_resume,
};

void setup_runstate_area(unsigned int cpu)
{
        struct vcpu_register_runstate_memory_area area;
        struct vcpu_runstate_info *rs = &per_cpu(runstate, cpu);
        int rc;

        set_xen_guest_handle(area.addr.h, rs);
        rc = HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
        if (rc) {
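                /*
                 * Registration fails with -ENOSYS on hypervisors lacking
                 * VCPUOP_register_runstate_memory_area.  Fall back to a
                 * zeroed structure, which reads as RUNSTATE_running; the
                 * BUILD_BUG_ON asserts that RUNSTATE_running really is 0.
                 */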
                BUILD_BUG_ON(RUNSTATE_running);
                memset(rs, 0, sizeof(*rs));
                WARN_ON(rc != -ENOSYS);
        }
}

static void __init _late_time_init(void)
{
        update_wallclock(false);
        xen_clockevents_init();
}

void __init time_init(void)
{
        init_cpu_khz();
        printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n",
               cpu_khz / 1000, cpu_khz % 1000);

        setup_runstate_area(0);
        get_time_values_from_xen(0);

        jiffies_bias     = jiffies_64;
        system_time_bias = per_cpu(shadow_time, 0).system_timestamp;

        clocksource_register_hz(&clocksource_xen, NSEC_PER_SEC);
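        /*
         * clocksource_register_hz() recomputes .mult/.shift for a 1GHz
         * (nanosecond-granular) counter, superseding the XEN_SHIFT-based
         * initialisers above.
         */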

        use_tsc_delay();

        /*
         * Cannot request_irq() until kmem is initialised, and cannot
         * do_settimeofday() (i.e. clock_was_set()) until interrupts are on.
         */
        late_time_init = _late_time_init;
}

/* Convert jiffies to system time. */
u64 jiffies_to_st(unsigned long j)
{
        u64 j64 = get_jiffies_64();
        long delta = j - (unsigned long)j64;

        if (delta < 1)
                /* Triggers in some wrap-around cases, but that's okay:
                 * we just end up with a shorter timeout. */
                return processed_system_time(j64) + NS_PER_TICK;

        if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0)
                /* Very long timeout means there is no pending timer.
                 * We indicate this to Xen by passing zero timeout. */
                return 0;

        return processed_system_time(j64) + delta * (u64)NS_PER_TICK;
}
EXPORT_SYMBOL(jiffies_to_st);

#ifdef CONFIG_CPU_FREQ
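/*
 * Without a constant-rate TSC, Xen's time scaling factors must track CPU
 * frequency changes, so forward cpufreq transition events to the
 * hypervisor via XENPF_change_freq.
 */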
static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
                                void *data)
{
        struct cpufreq_freqs *freq = data;
        struct xen_platform_op op;

        if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC))
                return 0;

        if (val == CPUFREQ_PRECHANGE)
                return 0;

        op.cmd = XENPF_change_freq;
        op.u.change_freq.flags = 0;
        op.u.change_freq.cpu = freq->cpu;
        op.u.change_freq.freq = (u64)freq->new * 1000;
        WARN_ON(HYPERVISOR_platform_op(&op));

        return 0;
}

static struct notifier_block time_cpufreq_notifier_block = {
        .notifier_call = time_cpufreq_notifier
};

static int __init cpufreq_time_setup(void)
{
        /* cpufreq_register_notifier() returns 0 on success. */
        if (cpufreq_register_notifier(&time_cpufreq_notifier_block,
                        CPUFREQ_TRANSITION_NOTIFIER)) {
                printk(KERN_ERR "failed to set up cpufreq notifier\n");
                return -ENODEV;
        }
        return 0;
}

core_initcall(cpufreq_time_setup);
#endif

/*
 * /proc/sys/xen: This really belongs in another file. It can stay here for
 * now however.
 */
static ctl_table xen_subtable[] = {
        {
                .procname       = "independent_wallclock",
                .data           = &independent_wallclock,
                .maxlen         = sizeof(independent_wallclock),
                .mode           = 0644,
                .proc_handler   = proc_dointvec
        },
        {
                .procname       = "permitted_clock_jitter",
                .data           = &permitted_clock_jitter,
                .maxlen         = sizeof(permitted_clock_jitter),
                .mode           = 0644,
                .proc_handler   = proc_doulongvec_minmax
        },
        { }
};
static ctl_table xen_table[] = {
        {
                .procname       = "xen",
                .mode           = 0555,
                .child          = xen_subtable
        },
        { }
};
static int __init xen_sysctl_init(void)
{
        (void)register_sysctl_table(xen_table);
        return 0;
}
__initcall(xen_sysctl_init);