/*
 * Copyright (c) 1991,1992,1995  Linus Torvalds
 * Copyright (c) 1994  Alan Modra
 * Copyright (c) 1995  Markus Kuhn
 * Copyright (c) 1996  Ingo Molnar
 * Copyright (c) 1998  Andrea Arcangeli
 * Copyright (c) 2002,2006  Vojtech Pavlik
 * Copyright (c) 2003  Andi Kleen
 */
12 #include <linux/init.h>
13 #include <linux/interrupt.h>
14 #include <linux/time.h>
15 #include <linux/export.h>
16 #include <linux/sysctl.h>
17 #include <linux/percpu.h>
18 #include <linux/kernel_stat.h>
19 #include <linux/posix-timers.h>
20 #include <linux/cpufreq.h>
21 #include <linux/clocksource.h>
23 #include <asm/vsyscall.h>
24 #include <asm/delay.h>
26 #include <asm/timer.h>
28 #include <xen/clock.h>
29 #include <xen/sysctl.h>
30 #include <xen/interface/vcpu.h>
/* Software clock tick counter, exported to the vsyscall/VDSO data page. */
33 DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES;
/* CPU frequency in kHz, derived from Xen's TSC scaling parameters
 * (see init_cpu_khz()) rather than measured locally. */
38 unsigned int cpu_khz; /* Detected as we calibrate the TSC */
39 EXPORT_SYMBOL(cpu_khz);
/* These are periodically updated in shared_info, and then copied here. */
42 struct shadow_time_info {
43 u64 tsc_timestamp; /* TSC at last update of time vals. */
44 u64 system_timestamp; /* Time, in nanosecs, since boot. */
/* Per-CPU shadow of Xen's time parameters (filled by get_time_values_from_xen()). */
50 static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
/* Last wall-clock value read from shared_info, plus its version counter. */
51 static struct timespec shadow_tv;
52 static u32 shadow_tv_version;
/* Anchors for processed_system_time(): jiffies value and Xen system time
 * captured at the same instant (set in time_init()/resume). */
54 static u64 jiffies_bias, system_time_bias;
56 /* Current runstate of each CPU (updated automatically by the hypervisor). */
57 DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
59 /* Must be signed, as it's compared with s64 quantities which can be -ve. */
60 #define NS_PER_TICK (1000000000LL/HZ)
62 /* Does this guest OS track Xen time, or set its wall clock independently? */
63 static int independent_wallclock = 0;
64 static int __init __independent_wallclock(char *str)
66 independent_wallclock = 1;
69 __setup("independent_wallclock", __independent_wallclock);
71 /* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
72 static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
73 static int __init __permitted_clock_jitter(char *str)
75 permitted_clock_jitter = simple_strtoul(str, NULL, 0);
78 __setup("permitted_clock_jitter=", __permitted_clock_jitter);
/*
 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
 * yielding a 64-bit result.
 */
/*
 * scale_delta(): multiply a 64-bit (shifted) TSC delta by a 32.32
 * fixed-point fraction and keep the upper 64 bits of the product.
 * NOTE(review): the body is only partially visible here -- the shift
 * handling and the #ifdef selecting between the i386 and x86-64 asm
 * variants are missing from this chunk.
 */
84 static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
/* i386 variant: 64x32 multiply built from 32x32->64 partial products. */
105 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
106 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
/* x86-64 variant: one 64x64 mul, then bits 32..95 extracted via shrd. */
109 "mul %%rdx ; shrd $32,%%rdx,%%rax"
110 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
116 static void init_cpu_khz(void)
118 u64 __cpu_khz = 1000000ULL << 32;
119 struct vcpu_time_info *info = &vcpu_info(0)->time;
120 do_div(__cpu_khz, info->tsc_to_system_mul);
121 if (info->tsc_shift < 0)
122 cpu_khz = __cpu_khz << -info->tsc_shift;
124 cpu_khz = __cpu_khz >> info->tsc_shift;
127 static u64 get_nsec_offset(struct shadow_time_info *shadow)
131 delta = now - shadow->tsc_timestamp;
132 return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
135 static inline u64 processed_system_time(u64 jiffies_64)
137 return (jiffies_64 - jiffies_bias) * NS_PER_TICK + system_time_bias;
/*
 * Refresh shadow_tv/shadow_tv_version from the shared-info wall clock
 * and, apparently when 'local' is set, fold the result into the kernel's
 * own wall clock via do_settimeofday().  NOTE(review): the retry-loop
 * opening and the branch guarding the do_settimeofday() path are not
 * visible in this chunk.
 */
140 static void update_wallclock(bool local)
/* Serializes concurrent wall-clock updaters. */
142 static DEFINE_MUTEX(uwc_mutex);
143 shared_info_t *s = HYPERVISOR_shared_info;
145 mutex_lock(&uwc_mutex);
/* Version-counter read: an odd wc_version means an update is in flight;
 * a changed wc_version means we raced with the hypervisor -- retry. */
148 shadow_tv_version = s->wc_version;
150 shadow_tv.tv_sec = s->wc_sec;
151 shadow_tv.tv_nsec = s->wc_nsec;
153 } while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version));
/* Wall clock = Xen boot wall clock + elapsed system time (from jiffies). */
156 u64 tmp = processed_system_time(get_jiffies_64());
157 long nsec = do_div(tmp, NSEC_PER_SEC);
160 set_normalized_timespec(&tv, shadow_tv.tv_sec + tmp,
161 shadow_tv.tv_nsec + nsec);
162 do_settimeofday(&tv);
165 mutex_unlock(&uwc_mutex);
168 static void _update_wallclock(struct work_struct *unused)
170 update_wallclock(true);
172 static DECLARE_WORK(update_wallclock_work, _update_wallclock);
/*
 * Schedule a deferred wall-clock resync when the hypervisor has
 * published a new wall-clock value (version mismatch), for domU guests
 * that track Xen time.  NOTE(review): the final clause of the condition
 * is not visible in this chunk.
 */
174 void xen_check_wallclock_update(void)
176 if (shadow_tv_version != HYPERVISOR_shared_info->wc_version
177 && !is_initial_xendomain() && !independent_wallclock
179 schedule_work(&update_wallclock_work);
/*
 * Reads a consistent set of time-base values from Xen, into a shadow data
 * area.
 */
/*
 * Copy the given CPU's vcpu_time_info into its shadow_time slot,
 * retrying under the version counter until a consistent snapshot is
 * obtained.  Runs with IRQs off to keep the copy atomic w.r.t. this CPU.
 * NOTE(review): the 'flags' declaration and loop opening are missing
 * from this chunk.
 */
186 static void get_time_values_from_xen(unsigned int cpu)
188 struct vcpu_time_info *src;
189 struct shadow_time_info *dst;
191 u32 pre_version, post_version;
193 src = &vcpu_info(cpu)->time;
194 dst = &per_cpu(shadow_time, cpu);
196 local_irq_save(flags);
/* Odd version = hypervisor mid-update; version change = we raced. */
199 pre_version = dst->version = src->version;
201 dst->tsc_timestamp = src->tsc_timestamp;
202 dst->system_timestamp = src->system_time;
203 dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
204 dst->tsc_shift = src->tsc_shift;
206 post_version = src->version;
207 } while ((pre_version & 1) | (pre_version ^ post_version));
/* Cache the usec-scale multiplier derived from the ns-scale one. */
209 dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;
211 local_irq_restore(flags);
214 static inline int time_values_up_to_date(void)
217 return this_cpu_read(shadow_time.version) == vcpu_info_read(time.version);
220 #ifdef CONFIG_XEN_PRIVILEGED_GUEST
/*
 * Set Xen's wall clock from the given timespec.  Only dom0 tracking Xen
 * time may do this.  NOTE(review): the declarations of 'nsec'/'now', the
 * error return, and the exact retry-loop control flow are not visible in
 * this chunk.
 */
221 int xen_update_wallclock(const struct timespec *tv)
225 struct shadow_time_info *shadow;
226 struct xen_platform_op op;
/* domU guests and independent wall clocks must not set Xen's clock. */
228 if (!is_initial_xendomain() || independent_wallclock)
231 shadow = &__get_cpu_var(shadow_time);
234 * Ensure we don't get blocked for a long time so that our time delta
235 * overflows. If that were to happen then our shadow time values would
236 * be stale, so we can retry with fresh ones.
/* Subtract the ns accumulated since shadow->system_timestamp so the
 * (secs, nsecs) pair matches that system_time reference point. */
239 nsec = tv->tv_nsec - get_nsec_offset(shadow);
240 if (time_values_up_to_date())
242 get_time_values_from_xen(smp_processor_id());
244 set_normalized_timespec(&now, tv->tv_sec, nsec);
/* Hand the wall clock plus its matching system_time anchor to Xen. */
246 op.cmd = XENPF_settime;
247 op.u.settime.secs = now.tv_sec;
248 op.u.settime.nsecs = now.tv_nsec;
249 op.u.settime.system_time = shadow->system_timestamp;
250 WARN_ON(HYPERVISOR_platform_op(&op));
/* Refresh the shadow copy; 'false' = do not touch the local wall clock. */
251 update_wallclock(false);
/*
 * Dom0 periodic timer: push the NTP-disciplined local wall clock back
 * into Xen once a minute, so the hypervisor clock benefits from NTP.
 */
256 static void sync_xen_wallclock(unsigned long dummy);
257 static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
258 static void sync_xen_wallclock(unsigned long dummy)
260 struct timespec now, ignore;
261 struct xen_platform_op op;
/* Only meaningful in dom0; do nothing unless NTP is synced and we track Xen time. */
263 BUG_ON(!is_initial_xendomain());
264 if (!ntp_synced() || independent_wallclock)
267 get_xtime_and_monotonic_and_sleep_offset(&now, &ignore, &ignore);
268 set_normalized_timespec(&now, now.tv_sec, now.tv_nsec);
270 op.cmd = XENPF_settime;
271 op.u.settime.secs = now.tv_sec;
272 op.u.settime.nsecs = now.tv_nsec;
/* system_time anchor taken from the current jiffies mapping. */
273 op.u.settime.system_time = processed_system_time(get_jiffies_64());
274 WARN_ON(HYPERVISOR_platform_op(&op));
/* Pick up the value we just pushed, without touching the local clock. */
276 update_wallclock(false);
278 /* Once per minute. */
279 mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
281 #endif /* CONFIG_XEN_PRIVILEGED_GUEST */
/*
 * Local clock in nanoseconds: Xen system time at the last shadow update
 * plus the TSC-extrapolated offset since then, retried until it is read
 * against a stable shadow version.  NOTE(review): the 'time' declaration,
 * loop opening and put_cpu()/return are not visible in this chunk.
 */
283 unsigned long long xen_local_clock(void)
285 unsigned int cpu = get_cpu();
286 struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
288 u32 local_time_version;
291 local_time_version = shadow->version;
/* base system time + ns elapsed since the shadow's TSC timestamp */
293 time = shadow->system_timestamp + get_nsec_offset(shadow);
/* If Xen has since published new parameters, refresh and re-read. */
294 if (!time_values_up_to_date())
295 get_time_values_from_xen(cpu);
297 } while (local_time_version != shadow->version);
/*
 * Wall-clock time in seconds: Xen's boot wall clock (read from the
 * shared-info page under its wc_version counter) plus the elapsed local
 * clock.  NOTE(review): the loop opening, the sec/nsec reads, and the
 * return are not visible in this chunk.
 */
304 unsigned long xen_read_wallclock(void)
306 const shared_info_t *s = HYPERVISOR_shared_info;
307 u32 version, sec, nsec;
311 version = s->wc_version;
/* odd version or changed version => hypervisor update raced us; retry */
316 } while ((s->wc_version & 1) | (version ^ s->wc_version));
318 delta = xen_local_clock() + (u64)sec * NSEC_PER_SEC + nsec;
319 do_div(delta, NSEC_PER_SEC);
324 int xen_write_wallclock(unsigned long now)
326 if (!is_initial_xendomain() || independent_wallclock)
329 #ifdef CONFIG_XEN_PRIVILEGED_GUEST
330 mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
333 return mach_set_rtc_mmss(now);
337 * Runstate accounting
/*
 * Copy this CPU's hypervisor-maintained runstate info into *res,
 * retrying while state_entry_time changes so the snapshot is
 * self-consistent.  Caller must be non-preemptible.  NOTE(review): the
 * copy of *state into *res and the loop opening are not visible in this
 * chunk.
 */
339 static void get_runstate_snapshot(struct vcpu_runstate_info *res)
342 struct vcpu_runstate_info *state;
/* The per-CPU area is only meaningful while pinned to this CPU. */
344 BUG_ON(preemptible());
346 state = &__get_cpu_var(runstate);
/* state_entry_time doubles as the consistency token for the copy. */
349 state_time = get_64bit_local(&state->state_entry_time);
351 } while (get_64bit_local(&state->state_entry_time) != state_time);
/* We are currently executing, so Xen should report RUNSTATE_running. */
353 WARN_ON_ONCE(res->state != RUNSTATE_running);
357 * Xen sched_clock implementation. Returns the number of unstolen
358 * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
/*
 * NOTE(review): the preempt_disable/put pair, the declarations of
 * 'now'/'offset'/'ret', and the final return are not visible in this
 * chunk.
 */
361 unsigned long long sched_clock(void)
363 struct vcpu_runstate_info runstate;
369 * Ideally sched_clock should be called on a per-cpu basis
370 * anyway, so preempt should already be disabled, but that's
371 * not current practice at the moment.
375 now = xen_local_clock();
377 get_runstate_snapshot(&runstate);
/* Time spent in the current (running) state since we entered it. */
379 offset = now - runstate.state_entry_time;
/* Unstolen time = current-state offset + accumulated RUNNING + BLOCKED. */
383 ret = offset + runstate.time[RUNSTATE_running]
384 + runstate.time[RUNSTATE_blocked];
/*
 * Best-effort caller PC for profiling: when the interrupted kernel PC is
 * inside a lock function, dig the real return address out of the frame
 * or stack instead.  NOTE(review): the non-FRAME_POINTER branch and the
 * plain-return tail are only partially visible in this chunk.
 */
391 unsigned long profile_pc(struct pt_regs *regs)
393 unsigned long pc = instruction_pointer(regs);
395 if (!user_mode_vm(regs) && in_lock_functions(pc)) {
396 #ifdef CONFIG_FRAME_POINTER
/* With frame pointers, the return address sits just above the saved bp. */
397 return *(unsigned long *)(regs->bp + sizeof(long));
400 (unsigned long *)kernel_stack_pointer(regs);
403 * Return address is either directly at stack pointer
404 * or above a saved flags. Eflags has bits 22-31 zero,
405 * kernel addresses don't.
416 EXPORT_SYMBOL(profile_pc);
/*
 * Clocksource/TSC-watchdog hook.  Under Xen the hypervisor owns the TSC,
 * so only the native (!CONFIG_XEN) path does anything.  NOTE(review):
 * the body inside the #ifndef is not visible in this chunk.
 */
418 void mark_tsc_unstable(char *reason)
420 #ifndef CONFIG_XEN /* XXX Should tell the hypervisor about this fact. */
424 EXPORT_SYMBOL_GPL(mark_tsc_unstable);
/* Last value returned by the clocksource, shared to enforce monotonicity. */
426 static cycle_t cs_last;
/*
 * Clocksource read callback.  Guards against per-CPU time regressions:
 * warns (rate-limited) when the regression exceeds permitted_clock_jitter,
 * and publishes the new value via cmpxchg64 on cs_last so concurrent
 * readers agree on a monotonic sequence.  NOTE(review): several
 * control-flow lines (loop/branch closings, put_cpu, returns) are not
 * visible in this chunk.
 */
428 static cycle_t xen_clocksource_read(struct clocksource *cs)
431 cycle_t last = get_64bit(&cs_last);
432 cycle_t ret = xen_local_clock();
/* Did our reading go backwards relative to the last published value? */
434 if (unlikely((s64)(ret - last) < 0)) {
435 if (last - ret > permitted_clock_jitter
436 && printk_ratelimit()) {
437 unsigned int cpu = get_cpu();
438 struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
440 printk(KERN_WARNING "clocksource/%u: "
441 "Time went backwards: "
442 "ret=%Lx delta=%Ld shadow=%Lx offset=%Lx\n",
443 cpu, ret, ret - last, shadow->system_timestamp,
444 get_nsec_offset(shadow));
/* Publish 'ret' unless another CPU already advanced cs_last past it. */
451 cycle_t cur = cmpxchg64(&cs_last, last, ret);
453 if (cur == last || (s64)(ret - cur) < 0)
458 return xen_local_clock();
462 /* No locking required. Interrupts are disabled on all CPUs. */
/*
 * Resume callback: re-fetch Xen time parameters for every online CPU and
 * re-anchor the jiffies<->system-time mapping and cs_last, so the
 * clocksource continues monotonically after suspend.  NOTE(review): the
 * 'cpu' declaration (and possibly other setup lines) are not visible in
 * this chunk.
 */
463 static void xen_clocksource_resume(struct clocksource *cs)
469 for_each_online_cpu(cpu)
470 get_time_values_from_xen(cpu);
/* Re-anchor processed_system_time() at "now". */
472 jiffies_bias = get_jiffies_64();
473 system_time_bias = per_cpu(shadow_time, 0).system_timestamp;
/* Reset the monotonicity guard used by xen_clocksource_read(). */
475 cs_last = xen_local_clock();
/* Xen clocksource: xen_local_clock() already reports nanoseconds, so the
 * mult/shift pair is an identity scaling.  NOTE(review): .name/.rating/
 * .shift initializers are not visible in this chunk. */
478 static struct clocksource clocksource_xen = {
481 .read = xen_clocksource_read,
482 .mask = CLOCKSOURCE_MASK(64),
483 .mult = 1 << XEN_SHIFT, /* time directly in nanoseconds */
485 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
486 .resume = xen_clocksource_resume,
/*
 * Register the per-CPU runstate buffer with the hypervisor so it keeps
 * per_cpu(runstate, cpu) updated.  On hypervisors lacking the op
 * (rc == -ENOSYS) the buffer is zeroed instead, which is valid only
 * because RUNSTATE_running == 0.  NOTE(review): the 'rc' declaration and
 * the branch structure around the fallback are not visible in this chunk.
 */
489 void setup_runstate_area(unsigned int cpu)
491 struct vcpu_register_runstate_memory_area area;
492 struct vcpu_runstate_info *rs = &per_cpu(runstate, cpu);
495 set_xen_guest_handle(area.addr.h, rs);
496 rc = HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
/* Compile-time check that the zeroed fallback encodes "running". */
498 BUILD_BUG_ON(RUNSTATE_running);
499 memset(rs, 0, sizeof(*rs));
/* Any failure other than "op not implemented" is unexpected. */
500 WARN_ON(rc != -ENOSYS);
504 static void __init _late_time_init(void)
506 update_wallclock(false);
507 xen_clockevents_init();
/*
 * Early time init: report the Xen-derived CPU speed, register CPU0's
 * runstate area and shadow time, anchor the jiffies<->system-time
 * mapping, and register the Xen clocksource.  Wall clock and clockevents
 * are deferred to _late_time_init().  NOTE(review): the opening brace
 * and any earlier calibration calls are not visible in this chunk.
 */
510 void __init time_init(void)
513 printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n",
514 cpu_khz / 1000, cpu_khz % 1000);
516 setup_runstate_area(0);
517 get_time_values_from_xen(0);
/* Anchor processed_system_time() at "now". */
519 jiffies_bias = jiffies_64;
520 system_time_bias = per_cpu(shadow_time, 0).system_timestamp;
522 clocksource_register_hz(&clocksource_xen, NSEC_PER_SEC);
527 * Cannot request_irq() until kmem is initialised, and cannot
528 * do_settimeofday() (i.e. clock_was_set()) until interrupts are on.
530 late_time_init = _late_time_init;
533 /* Convert jiffies to system time. */
534 u64 jiffies_to_st(unsigned long j)
536 u64 j64 = get_jiffies_64();
537 long delta = j - (unsigned long)j64;
540 /* Triggers in some wrap-around cases, but that's okay:
541 * we just end up with a shorter timeout. */
542 return processed_system_time(j64) + NS_PER_TICK;
544 if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0)
545 /* Very long timeout means there is no pending timer.
546 * We indicate this to Xen by passing zero timeout. */
549 return processed_system_time(j64) + delta * (u64)NS_PER_TICK;
551 EXPORT_SYMBOL(jiffies_to_st);
553 #ifdef CONFIG_CPU_FREQ
/*
 * cpufreq transition hook: tell Xen the CPU's new frequency so it can
 * rescale its TSC-based time parameters.  NOTE(review): the early
 * 'return' statements, parameter tail, and closing lines are not visible
 * in this chunk.
 */
554 static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
557 struct cpufreq_freqs *freq = data;
558 struct xen_platform_op op;
/* Constant-TSC CPUs keep the TSC rate across P-states: nothing to do. */
560 if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC))
/* Only act once the frequency change has actually taken effect. */
563 if (val == CPUFREQ_PRECHANGE)
566 op.cmd = XENPF_change_freq;
567 op.u.change_freq.flags = 0;
568 op.u.change_freq.cpu = freq->cpu;
/* freq->new is in kHz; Xen expects Hz. */
569 op.u.change_freq.freq = (u64)freq->new * 1000;
570 WARN_ON(HYPERVISOR_platform_op(&op));
575 static struct notifier_block time_cpufreq_notifier_block = {
576 .notifier_call = time_cpufreq_notifier
579 static int __init cpufreq_time_setup(void)
581 if (!cpufreq_register_notifier(&time_cpufreq_notifier_block,
582 CPUFREQ_TRANSITION_NOTIFIER)) {
583 printk(KERN_ERR "failed to set up cpufreq notifier\n");
589 core_initcall(cpufreq_time_setup);
593 * /proc/sys/xen: This really belongs in another file. It can stay here for
/* Sysctl knobs: xen.independent_wallclock (int) and
 * xen.permitted_clock_jitter (unsigned long, nsecs).  NOTE(review):
 * .mode initializers and table terminators are not visible in this
 * chunk. */
596 static ctl_table xen_subtable[] = {
598 .procname = "independent_wallclock",
599 .data = &independent_wallclock,
600 .maxlen = sizeof(independent_wallclock),
602 .proc_handler = proc_dointvec
605 .procname = "permitted_clock_jitter",
606 .data = &permitted_clock_jitter,
607 .maxlen = sizeof(permitted_clock_jitter),
609 .proc_handler = proc_doulongvec_minmax
/* Parent directory entry: /proc/sys/xen. */
613 static ctl_table xen_table[] = {
617 .child = xen_subtable
/* Register /proc/sys/xen at boot; the returned handle is deliberately
 * discarded since it is never unregistered. */
621 static int __init xen_sysctl_init(void)
623 (void)register_sysctl_table(xen_table);
626 __initcall(xen_sysctl_init);